/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_unregister_library(void); // called by __kmp_internal_end()
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
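/* Lookup order: the gtid kept in native thread-local data when
   __kmp_gtid_mode >= 3, the keyed TLS value when __kmp_gtid_mode >= 2, and
   otherwise a scan of the registered threads' stack ranges for the one that
   contains our current stack address. */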
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
failed to find " 172 "thread, using TLS\n")); 173 i = __kmp_gtid_get_specific(); 174 175 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 176 177 /* if we havn't been assigned a gtid, then return code */ 178 if (i < 0) 179 return i; 180 181 /* dynamically updated stack window for uber threads to avoid get_specific 182 call */ 183 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 184 KMP_FATAL(StackOverflow, i); 185 } 186 187 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 188 if (stack_addr > stack_base) { 189 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 190 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 191 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 192 stack_base); 193 } else { 194 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 195 stack_base - stack_addr); 196 } 197 198 /* Reprint stack bounds for ubermaster since they have been refined */ 199 if (__kmp_storage_map) { 200 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 201 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 202 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 203 other_threads[i]->th.th_info.ds.ds_stacksize, 204 "th_%d stack (refinement)", i); 205 } 206 return i; 207 } 208 209 int __kmp_get_global_thread_id_reg() { 210 int gtid; 211 212 if (!__kmp_init_serial) { 213 gtid = KMP_GTID_DNE; 214 } else 215 #ifdef KMP_TDATA_GTID 216 if (TCR_4(__kmp_gtid_mode) >= 3) { 217 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 218 gtid = __kmp_gtid; 219 } else 220 #endif 221 if (TCR_4(__kmp_gtid_mode) >= 2) { 222 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 223 gtid = __kmp_gtid_get_specific(); 224 } else { 225 KA_TRACE(1000, 226 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 227 gtid = __kmp_get_global_thread_id(); 228 } 229 230 /* we must be a new uber master sibling thread */ 231 if (gtid == KMP_GTID_DNE) { 232 KA_TRACE(10, 233 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 234 "Registering a new gtid.\n")); 235 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 236 if (!__kmp_init_serial) { 237 __kmp_do_serial_initialize(); 238 gtid = __kmp_gtid_get_specific(); 239 } else { 240 gtid = __kmp_register_root(FALSE); 241 } 242 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 243 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 244 } 245 246 KMP_DEBUG_ASSERT(gtid >= 0); 247 248 return gtid; 249 } 250 251 /* caller must hold forkjoin_lock */ 252 void __kmp_check_stack_overlap(kmp_info_t *th) { 253 int f; 254 char *stack_beg = NULL; 255 char *stack_end = NULL; 256 int gtid; 257 258 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 259 if (__kmp_storage_map) { 260 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 261 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 262 263 gtid = __kmp_gtid_from_thread(th); 264 265 if (gtid == KMP_GTID_MONITOR) { 266 __kmp_print_storage_map_gtid( 267 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 268 "th_%s stack (%s)", "mon", 269 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 270 } else { 271 __kmp_print_storage_map_gtid( 272 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 273 "th_%d stack (%s)", gtid, 274 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 275 } 276 } 277 278 /* No point in checking ubermaster threads since they use refinement and 279 * cannot overlap */ 280 gtid = __kmp_gtid_from_thread(th); 281 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 282 KA_TRACE(10, 283 ("__kmp_check_stack_overlap: performing extensive checking\n")); 284 if (stack_beg == NULL) { 285 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 286 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 287 } 288 289 for (f = 0; f < __kmp_threads_capacity; f++) { 290 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 291 292 if (f_th && f_th != th) { 293 char *other_stack_end = 294 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 295 char *other_stack_beg = 296 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 297 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 298 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 299 300 /* Print the other stack values before the abort */ 301 if (__kmp_storage_map) 302 __kmp_print_storage_map_gtid( 303 -1, other_stack_beg, other_stack_end, 304 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 305 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 306 307 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 308 __kmp_msg_null); 309 } 310 } 311 } 312 } 313 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 314 } 315 316 /* ------------------------------------------------------------------------ */ 317 318 void __kmp_infinite_loop(void) { 319 static int done = FALSE; 320 321 while (!done) { 322 KMP_YIELD(TRUE); 323 } 324 } 325 326 #define MAX_MESSAGE 512 327 328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 329 char const *format, ...) { 330 char buffer[MAX_MESSAGE]; 331 va_list ap; 332 333 va_start(ap, format); 334 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 335 p2, (unsigned long)size, format); 336 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 337 __kmp_vprintf(kmp_err, buffer, ap); 338 #if KMP_PRINT_DATA_PLACEMENT 339 int node; 340 if (gtid >= 0) { 341 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 342 if (__kmp_storage_map_verbose) { 343 node = __kmp_get_host_node(p1); 344 if (node < 0) /* doesn't work, so don't try this next time */ 345 __kmp_storage_map_verbose = FALSE; 346 else { 347 char *last; 348 int lastNode; 349 int localProc = __kmp_get_cpu_from_gtid(gtid); 350 351 const int page_size = KMP_GET_PAGE_SIZE(); 352 353 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 354 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 355 if (localProc >= 0) 356 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 357 localProc >> 1); 358 else 359 __kmp_printf_no_lock(" GTID %d\n", gtid); 360 #if KMP_USE_PRCTL 361 /* The more elaborate format is disabled for now because of the prctl 362 * hanging bug. */ 363 do { 364 last = p1; 365 lastNode = node; 366 /* This loop collates adjacent pages with the same host node. 
            do {
              (char *)p1 += page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

  if (KMP_OS_WINDOWS) {
    // Let other threads know of abnormal termination and prevent deadlock
    // if abort happened during library initialization or shutdown
    __kmp_global.g.g_abort = SIGABRT;

    /* On Windows* OS, by default abort() causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress pop-up
       error boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not a problem for the DLL, but it is a problem
       for the static OpenMP RTL). SetErrorMode (and so the timelimit utility)
       does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
    raise(SIGABRT);
    _exit(3); // Just in case, if signal ignored, exit anyway.
  } else {
    abort();
  }

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */
static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}

static void __kmp_init_allocator() { __kmp_init_memkind(); }
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }

/* ------------------------------------------------------------------------ */

#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
  // TODO: Change to __kmp_break_bootstrap_lock().
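  // Re-initializing the lock leaves it in the released state, which is what is
  // needed here when the thread that owned it may already have been killed at
  // process detach.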
  __kmp_init_bootstrap_lock(lck); // make the lock released
}

static void __kmp_reset_locks_on_process_detach(int gtid_req) {
  int i;
  int thread_count;

  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
  // the one calling ProcessExit or FreeLibrary). So it might seem safe to
  // access __kmp_threads[] without taking the forkjoin_lock. In fact, however,
  // some threads can still be alive here, although they are about to be
  // terminated. The threads in the array with ds_thread==0 are the most
  // suspicious, so it may not actually be safe to access __kmp_threads[].

  // TODO: does it make sense to check __kmp_roots[] ?

  // Let's check that there are no other alive threads registered with the OMP
  // lib.
  while (1) {
    thread_count = 0;
    for (i = 0; i < __kmp_threads_capacity; ++i) {
      if (!__kmp_threads)
        continue;
      kmp_info_t *th = __kmp_threads[i];
      if (th == NULL)
        continue;
      int gtid = th->th.th_info.ds.ds_gtid;
      if (gtid == gtid_req)
        continue;
      if (gtid < 0)
        continue;
      DWORD exit_val;
      int alive = __kmp_is_thread_alive(th, &exit_val);
      if (alive) {
        ++thread_count;
      }
    }
    if (thread_count == 0)
      break; // success
  }

  // Assume that I'm alone. Now it might be safe to check and reset locks.
  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
  __kmp_reset_lock(&__kmp_forkjoin_lock);
#ifdef KMP_DEBUG
  __kmp_reset_lock(&__kmp_stdio_lock);
#endif // KMP_DEBUG
}

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    if (lpReserved != NULL) {
      // lpReserved is used for telling the difference:
      //   lpReserved == NULL when FreeLibrary() was called,
      //   lpReserved != NULL when the process terminates.
      // When FreeLibrary() is called, worker threads remain alive. So they will
      // release the forkjoin lock by themselves. When the process terminates,
      // worker threads disappear, triggering the problem of an unreleased
      // forkjoin lock as described below.

      // A worker thread can take the forkjoin lock. The problem comes up if
      // that worker thread becomes dead before it releases the forkjoin lock.
      // The forkjoin lock remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
      // to take the forkjoin lock and will always fail, so that the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. It looks like it's not a corner
      // case, but common cases:
      //   - the main function was compiled by an alternative compiler;
      //   - the main function was compiled by icl but without /Qopenmp
      //     (application with plugins);
      //   - application terminates by calling C exit(), Fortran CALL EXIT() or
      //     Fortran STOP.
      //   - alive foreign thread prevented __kmpc_end from doing cleanup.
      //
      // This is a hack to work around the problem.
      // TODO: !!! figure out something better.
      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
    }

    __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
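    /* Whichever thread first advances team->t.t_construct from old_this to its
       own this_construct value wins the compare-and-swap below and executes the
       single block; the others see the updated counter and skip it. */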
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level ==
            1) { // Only report metadata by master of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nproc is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
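    // That is: processors not yet occupied (__kmp_avail_proc - __kmp_nth) plus
    // the slots already counted in __kmp_nth that this team will reuse: just
    // the master when the root is active, otherwise the whole hot team.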
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}

/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked on that
   earlier within the forkjoin critical section. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the master thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the master thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get master values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of the
   single master thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KMP_DEBUG_ASSERT(
        this_thr->th.th_task_team ==
        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
                     NULL);
    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
                  "team %p, new task_team = NULL\n",
                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    this_thr->th.th_task_team = NULL;
  }

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  ompt_data_t *implicit_task_data;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
          codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1229 global_tid, serial_team)); 1230 1231 /* TODO the above breaks the requirement that if we run out of resources, 1232 then we can still guarantee that serialized teams are ok, since we may 1233 need to allocate a new one */ 1234 } else { 1235 KF_TRACE( 1236 10, 1237 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1238 global_tid, serial_team)); 1239 } 1240 1241 /* we have to initialize this serial team */ 1242 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1243 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1244 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1245 serial_team->t.t_ident = loc; 1246 serial_team->t.t_serialized = 1; 1247 serial_team->t.t_nproc = 1; 1248 serial_team->t.t_parent = this_thr->th.th_team; 1249 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1250 this_thr->th.th_team = serial_team; 1251 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1252 1253 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1254 this_thr->th.th_current_task)); 1255 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1256 this_thr->th.th_current_task->td_flags.executing = 0; 1257 1258 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1259 1260 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1261 implicit task for each serialized task represented by 1262 team->t.t_serialized? */ 1263 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1264 &this_thr->th.th_current_task->td_parent->td_icvs); 1265 1266 // Thread value exists in the nested nthreads array for the next nested 1267 // level 1268 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1269 this_thr->th.th_current_task->td_icvs.nproc = 1270 __kmp_nested_nth.nth[level + 1]; 1271 } 1272 1273 if (__kmp_nested_proc_bind.used && 1274 (level + 1 < __kmp_nested_proc_bind.used)) { 1275 this_thr->th.th_current_task->td_icvs.proc_bind = 1276 __kmp_nested_proc_bind.bind_types[level + 1]; 1277 } 1278 1279 #if USE_DEBUGGER 1280 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }
    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped.
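    // (Linking swaps the lightweight task team's contents with the thread's
    // current team/task info, so from here on that data is reached through the
    // OMPT_CUR_TEAM_DATA / OMPT_CUR_TASK_INFO macros rather than lw_taskteam.)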
    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    va_list *ap
#else
                    va_list ap
#endif
                    ) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These 2 lines below are so this does not get optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();
    __kmp_resume_if_soft_paused();

    /* setup current data */
    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
    // shutdown
    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    ompt_data_t *implicit_task_data;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nested is not allowed
    active_level = parent_team->t.t_active_level;
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
#if KMP_NESTED_HOT_TEAMS
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either actual or not needed (when active_level > 0)
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
            OMPT_INVOKER(call_context), return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

    if (master_th->th.th_teams_microtask && ap &&
        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is start of parallel that is nested inside teams construct.
      // The team is actual (hot), all workers are ready at the fork barrier.
      // No lock needed to initialize the team a bit, then free workers.
      parent_team->t.t_ident = loc;
      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
      parent_team->t.t_argc = argc;
      argv = (void **)parent_team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
        *argv++ = va_arg(*ap, void *);
#else
        *argv++ = va_arg(ap, void *);
#endif
      // Increment our nested depth levels, but do not increase the
      // serialization
      if (parent_team == master_th->th.th_serial_team) {
        // AC: we are in serialized parallel
        __kmpc_serialized_parallel(loc, gtid);
        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: need this so that enquiry functions work
        // correctly; will restore at join time
        parent_team->t.t_serialized--;
#if OMPT_SUPPORT
        void *dummy;
        void **exit_runtime_p;

        ompt_lw_taskteam_t lw_taskteam;

        if (ompt_enabled.enabled) {
          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                  &ompt_parallel_data, return_address);
          exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped.

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
                ompt_task_implicit); // TODO: Can this be ompt_task_initial?
            OMPT_CUR_TASK_INFO(master_th)->thread_num =
                __kmp_tid_from_gtid(gtid);
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
        } else {
          exit_runtime_p = &dummy;
        }
#endif

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_runtime_p
#endif
                                 );
        }

#if OMPT_SUPPORT
        *exit_runtime_p = NULL;
        if (ompt_enabled.enabled) {
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num,
                ompt_task_implicit); // TODO: Can this be ompt_task_initial?
          }
          __ompt_lw_taskteam_unlink(master_th);

          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context), return_address);
          }
          master_th->th.ompt_thread_info.state = ompt_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;
      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

      /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel has num_threads clause
        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: only can reduce number of threads dynamically, can't increase
          kmp_info_t **other_threads = parent_team->t.t_threads;
          parent_team->t.t_nproc = master_set_numthreads;
          for (i = 0; i < master_set_numthreads; ++i) {
            other_threads[i]->th.th_team_nproc = master_set_numthreads;
          }
          // Keep extra threads hot in the team for possible next parallels
        }
        master_th->th.th_set_nproc = 0;
      }

#if USE_DEBUGGER
      if (__kmp_debugging) { // Let debugger override number of threads.
        int nth = __kmp_omp_num_threads(loc);
        if (nth > 0) { // 0 means debugger doesn't want to change num threads
          master_set_numthreads = nth;
        }
      }
#endif

      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));
      __kmp_internal_fork(loc, gtid, parent_team);
      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));

      /* Invoke microtask for MASTER thread */
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));

      if (!parent_team->t.t_invoke(gtid)) {
        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
      }
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));
      KMP_MB(); /* Flush all pending memory write invalidates. */

      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));

      return TRUE;
    } // Parallel closely nested in teams construct

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                       parent_team->t.t_task_team[master_th->th.th_task_state]);
    }
#endif

    if (parent_team->t.t_active_level >=
        master_th->th.th_current_task->td_icvs.max_active_levels) {
      nthreads = 1;
    } else {
      int enter_teams = ((ap == NULL && active_level == 0) ||
                         (ap && teams_level > 0 && teams_level == level));
      nthreads =
          master_set_numthreads
              ? master_set_numthreads
              : get__nproc_2(
                    parent_team,
                    master_tid); // TODO: get nproc directly from current task

      // Check if we need to take the forkjoin lock (no need for a serialized
      // parallel outside of a teams construct). This code was moved here from
      // __kmp_reserve_threads() to speed up nested serialized parallels.
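      // Serialize up front when nesting cannot add parallelism: either
      // max-active-levels-var is 1 and this root is already inside an active
      // parallel region (and we are not entering a teams construct), or the
      // library mode is "serial".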
1659 if (nthreads > 1) { 1660 if ((get__max_active_levels(master_th) == 1 && 1661 (root->r.r_in_parallel && !enter_teams)) || 1662 (__kmp_library == library_serial)) { 1663 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1664 " threads\n", 1665 gtid, nthreads)); 1666 nthreads = 1; 1667 } 1668 } 1669 if (nthreads > 1) { 1670 /* determine how many new threads we can use */ 1671 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1672 /* AC: If we execute teams from parallel region (on host), then teams 1673 should be created but each can only have 1 thread if nesting is 1674 disabled. If teams called from serial region, then teams and their 1675 threads should be created regardless of the nesting setting. */ 1676 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1677 nthreads, enter_teams); 1678 if (nthreads == 1) { 1679 // Free lock for single thread execution here; for multi-thread 1680 // execution it will be freed later after team of threads created 1681 // and initialized 1682 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1683 } 1684 } 1685 } 1686 KMP_DEBUG_ASSERT(nthreads > 0); 1687 1688 // If we temporarily changed the set number of threads then restore it now 1689 master_th->th.th_set_nproc = 0; 1690 1691 /* create a serialized parallel region? */ 1692 if (nthreads == 1) { 1693 /* josh todo: hypothetical question: what do we do for OS X*? */ 1694 #if KMP_OS_LINUX && \ 1695 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1696 void *args[argc]; 1697 #else 1698 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1699 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1700 KMP_ARCH_AARCH64) */ 1701 1702 KA_TRACE(20, 1703 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1704 1705 __kmpc_serialized_parallel(loc, gtid); 1706 1707 if (call_context == fork_context_intel) { 1708 /* TODO this sucks, use the compiler itself to pass args! :) */ 1709 master_th->th.th_serial_team->t.t_ident = loc; 1710 if (!ap) { 1711 // revert change made in __kmpc_serialized_parallel() 1712 master_th->th.th_serial_team->t.t_level--; 1713 // Get args from parent team for teams construct 1714 1715 #if OMPT_SUPPORT 1716 void *dummy; 1717 void **exit_runtime_p; 1718 ompt_task_info_t *task_info; 1719 1720 ompt_lw_taskteam_t lw_taskteam; 1721 1722 if (ompt_enabled.enabled) { 1723 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1724 &ompt_parallel_data, return_address); 1725 1726 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1727 // don't use lw_taskteam after linking. content was swaped 1728 1729 task_info = OMPT_CUR_TASK_INFO(master_th); 1730 exit_runtime_p = &(task_info->frame.exit_frame.ptr); 1731 if (ompt_enabled.ompt_callback_implicit_task) { 1732 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1733 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1734 &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
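// Both branches of the args allocation above keep the outlined-routine
// argument array on the caller's stack: a VLA where it is known to be
// available, KMP_ALLOCA elsewhere. A hedged, self-contained sketch of the
// same pattern using plain alloca (names are illustrative; header
// availability is platform-specific), excluded from the build:
#if 0
#include <alloca.h> // Windows builds would use _alloca from <malloc.h> instead
static void stack_args_sketch(int argc) {
  // Equivalent in spirit to the `void *args[argc]` VLA branch above.
  void **args = (void **)alloca(argc * sizeof(void *));
  for (int i = 0; i < argc; ++i)
    args[i] = nullptr; // filled from the incoming va_list in the real code
  (void)args;
}
#endif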
1735 OMPT_CUR_TASK_INFO(master_th) 1736 ->thread_num = __kmp_tid_from_gtid(gtid); 1737 } 1738 1739 /* OMPT state */ 1740 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1741 } else { 1742 exit_runtime_p = &dummy; 1743 } 1744 #endif 1745 1746 { 1747 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1748 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1749 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1750 parent_team->t.t_argv 1751 #if OMPT_SUPPORT 1752 , 1753 exit_runtime_p 1754 #endif 1755 ); 1756 } 1757 1758 #if OMPT_SUPPORT 1759 if (ompt_enabled.enabled) { 1760 exit_runtime_p = NULL; 1761 if (ompt_enabled.ompt_callback_implicit_task) { 1762 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1763 ompt_scope_end, NULL, &(task_info->task_data), 1, 1764 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1765 } 1766 1767 __ompt_lw_taskteam_unlink(master_th); 1768 if (ompt_enabled.ompt_callback_parallel_end) { 1769 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1770 OMPT_CUR_TEAM_DATA(master_th), parent_task_data, 1771 OMPT_INVOKER(call_context), return_address); 1772 } 1773 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1774 } 1775 #endif 1776 } else if (microtask == (microtask_t)__kmp_teams_master) { 1777 KMP_DEBUG_ASSERT(master_th->th.th_team == 1778 master_th->th.th_serial_team); 1779 team = master_th->th.th_team; 1780 // team->t.t_pkfn = microtask; 1781 team->t.t_invoke = invoker; 1782 __kmp_alloc_argv_entries(argc, team, TRUE); 1783 team->t.t_argc = argc; 1784 argv = (void **)team->t.t_argv; 1785 if (ap) { 1786 for (i = argc - 1; i >= 0; --i) 1787 // TODO: revert workaround for Intel(R) 64 tracker #96 1788 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1789 *argv++ = va_arg(*ap, void *); 1790 #else 1791 *argv++ = va_arg(ap, void *); 1792 #endif 1793 } else { 1794 for (i = 0; i < argc; ++i) 1795 // Get args from parent team for teams construct 1796 argv[i] = parent_team->t.t_argv[i]; 1797 } 1798 // AC: revert change made in __kmpc_serialized_parallel() 1799 // because initial code in teams should have level=0 1800 team->t.t_level--; 1801 // AC: call special invoker for outer "parallel" of teams construct 1802 invoker(gtid); 1803 } else { 1804 argv = args; 1805 for (i = argc - 1; i >= 0; --i) 1806 // TODO: revert workaround for Intel(R) 64 tracker #96 1807 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1808 *argv++ = va_arg(*ap, void *); 1809 #else 1810 *argv++ = va_arg(ap, void *); 1811 #endif 1812 KMP_MB(); 1813 1814 #if OMPT_SUPPORT 1815 void *dummy; 1816 void **exit_runtime_p; 1817 ompt_task_info_t *task_info; 1818 1819 ompt_lw_taskteam_t lw_taskteam; 1820 1821 if (ompt_enabled.enabled) { 1822 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1823 &ompt_parallel_data, return_address); 1824 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1825 // don't use lw_taskteam after linking. content was swaped 1826 task_info = OMPT_CUR_TASK_INFO(master_th); 1827 exit_runtime_p = &(task_info->frame.exit_frame.ptr); 1828 1829 /* OMPT implicit task begin */ 1830 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1831 if (ompt_enabled.ompt_callback_implicit_task) { 1832 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1833 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1834 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
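// The va_arg(*ap, ...) vs. va_arg(ap, ...) split in the copy loops above is
// only about how the variadic handle is forwarded on different targets (by
// pointer to va_list on the listed Linux architectures, by value elsewhere);
// the copy itself is a plain "drain the va_list into an array" loop. A
// self-contained restatement that sidesteps the ABI detail (function name is
// hypothetical), not compiled:
#if 0
#include <cstdarg>
static void pack_args_sketch(void **argv, int argc, ...) {
  va_list ap;
  va_start(ap, argc);
  // The index only counts argc entries; va_arg yields arguments in order.
  for (int i = argc - 1; i >= 0; --i)
    *argv++ = va_arg(ap, void *);
  va_end(ap);
}
#endif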
1835 OMPT_CUR_TASK_INFO(master_th) 1836 ->thread_num = __kmp_tid_from_gtid(gtid); 1837 } 1838 1839 /* OMPT state */ 1840 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1841 } else { 1842 exit_runtime_p = &dummy; 1843 } 1844 #endif 1845 1846 { 1847 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1848 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1849 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1850 #if OMPT_SUPPORT 1851 , 1852 exit_runtime_p 1853 #endif 1854 ); 1855 } 1856 1857 #if OMPT_SUPPORT 1858 if (ompt_enabled.enabled) { 1859 *exit_runtime_p = NULL; 1860 if (ompt_enabled.ompt_callback_implicit_task) { 1861 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1862 ompt_scope_end, NULL, &(task_info->task_data), 1, 1863 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1864 } 1865 1866 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1867 __ompt_lw_taskteam_unlink(master_th); 1868 if (ompt_enabled.ompt_callback_parallel_end) { 1869 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1870 &ompt_parallel_data, parent_task_data, 1871 OMPT_INVOKER(call_context), return_address); 1872 } 1873 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1874 } 1875 #endif 1876 } 1877 } else if (call_context == fork_context_gnu) { 1878 #if OMPT_SUPPORT 1879 ompt_lw_taskteam_t lwt; 1880 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1881 return_address); 1882 1883 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1884 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1885 // don't use lw_taskteam after linking. content was swaped 1886 #endif 1887 1888 // we were called from GNU native code 1889 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1890 return FALSE; 1891 } else { 1892 KMP_ASSERT2(call_context < fork_context_last, 1893 "__kmp_fork_call: unknown fork_context parameter"); 1894 } 1895 1896 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1897 KMP_MB(); 1898 return FALSE; 1899 } // if (nthreads == 1) 1900 1901 // GEH: only modify the executing flag in the case when not serialized 1902 // serialized case is handled in kmpc_serialized_parallel 1903 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1904 "curtask=%p, curtask_max_aclevel=%d\n", 1905 parent_team->t.t_active_level, master_th, 1906 master_th->th.th_current_task, 1907 master_th->th.th_current_task->td_icvs.max_active_levels)); 1908 // TODO: GEH - cannot do this assertion because root thread not set up as 1909 // executing 1910 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1911 master_th->th.th_current_task->td_flags.executing = 0; 1912 1913 if (!master_th->th.th_teams_microtask || level > teams_level) { 1914 /* Increment our nested depth level */ 1915 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1916 } 1917 1918 // See if we need to make a copy of the ICVs. 1919 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1920 if ((level + 1 < __kmp_nested_nth.used) && 1921 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1922 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1923 } else { 1924 nthreads_icv = 0; // don't update 1925 } 1926 1927 // Figure out the proc_bind_policy for the new team. 
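// The nthreads_icv selection above consults the per-level list parsed from a
// nested OMP_NUM_THREADS setting such as "8,2": if the next nesting level has
// its own entry and it differs from the current nproc ICV, that entry becomes
// the child's ICV; 0 means "leave the ICV alone". A small sketch with
// hypothetical types, excluded from the build:
#if 0
struct nested_nth_sketch {
  int *nth; // per-level values from OMP_NUM_THREADS, e.g. {8, 2}
  int used; // number of valid entries
};
// Returns the nproc ICV for the child level, or 0 for "do not update".
static int child_nproc_icv_sketch(const nested_nth_sketch &list, int level,
                                  int current_nproc_icv) {
  if (level + 1 < list.used && list.nth[level + 1] != current_nproc_icv)
    return list.nth[level + 1];
  return 0;
}
#endif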
1928 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1929 kmp_proc_bind_t proc_bind_icv = 1930 proc_bind_default; // proc_bind_default means don't update 1931 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1932 proc_bind = proc_bind_false; 1933 } else { 1934 if (proc_bind == proc_bind_default) { 1935 // No proc_bind clause specified; use current proc-bind-var for this 1936 // parallel region 1937 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1938 } 1939 /* else: The proc_bind policy was specified explicitly on parallel clause. 1940 This overrides proc-bind-var for this parallel region, but does not 1941 change proc-bind-var. */ 1942 // Figure the value of proc-bind-var for the child threads. 1943 if ((level + 1 < __kmp_nested_proc_bind.used) && 1944 (__kmp_nested_proc_bind.bind_types[level + 1] != 1945 master_th->th.th_current_task->td_icvs.proc_bind)) { 1946 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1947 } 1948 } 1949 1950 // Reset for next parallel region 1951 master_th->th.th_set_proc_bind = proc_bind_default; 1952 1953 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1954 kmp_internal_control_t new_icvs; 1955 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1956 new_icvs.next = NULL; 1957 if (nthreads_icv > 0) { 1958 new_icvs.nproc = nthreads_icv; 1959 } 1960 if (proc_bind_icv != proc_bind_default) { 1961 new_icvs.proc_bind = proc_bind_icv; 1962 } 1963 1964 /* allocate a new parallel team */ 1965 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1966 team = __kmp_allocate_team(root, nthreads, nthreads, 1967 #if OMPT_SUPPORT 1968 ompt_parallel_data, 1969 #endif 1970 proc_bind, &new_icvs, 1971 argc USE_NESTED_HOT_ARG(master_th)); 1972 } else { 1973 /* allocate a new parallel team */ 1974 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1975 team = __kmp_allocate_team(root, nthreads, nthreads, 1976 #if OMPT_SUPPORT 1977 ompt_parallel_data, 1978 #endif 1979 proc_bind, 1980 &master_th->th.th_current_task->td_icvs, 1981 argc USE_NESTED_HOT_ARG(master_th)); 1982 } 1983 KF_TRACE( 1984 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 1985 1986 /* setup the new team */ 1987 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 1988 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 1989 KMP_CHECK_UPDATE(team->t.t_ident, loc); 1990 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 1991 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 1992 #if OMPT_SUPPORT 1993 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 1994 return_address); 1995 #endif 1996 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 1997 // TODO: parent_team->t.t_level == INT_MAX ??? 
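// The proc_bind resolution above follows the usual precedence: a
// proc-bind-var of false disables binding outright, otherwise an explicit
// proc_bind clause overrides proc-bind-var for this region only, and the
// child's proc-bind-var may be taken from the next entry of the nested
// OMP_PROC_BIND list. A sketch of just the region-level choice (hypothetical
// enum and helper), excluded from the build:
#if 0
enum proc_bind_sketch { pb_default, pb_false, pb_master, pb_close, pb_spread };
static proc_bind_sketch resolve_proc_bind_sketch(proc_bind_sketch clause,
                                                 proc_bind_sketch icv) {
  if (icv == pb_false)
    return pb_false; // binding disabled; the clause cannot re-enable it here
  return (clause == pb_default) ? icv // no clause: use proc-bind-var
                                : clause; // clause overrides for this region
}
#endif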
1998 if (!master_th->th.th_teams_microtask || level > teams_level) { 1999 int new_level = parent_team->t.t_level + 1; 2000 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2001 new_level = parent_team->t.t_active_level + 1; 2002 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2003 } else { 2004 // AC: Do not increase parallel level at start of the teams construct 2005 int new_level = parent_team->t.t_level; 2006 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2007 new_level = parent_team->t.t_active_level; 2008 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2009 } 2010 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2011 // set master's schedule as new run-time schedule 2012 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2013 2014 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2015 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2016 2017 // Update the floating point rounding in the team if required. 2018 propagateFPControl(team); 2019 2020 if (__kmp_tasking_mode != tskm_immediate_exec) { 2021 // Set master's task team to team's task team. Unless this is hot team, it 2022 // should be NULL. 2023 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2024 parent_team->t.t_task_team[master_th->th.th_task_state]); 2025 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2026 "%p, new task_team %p / team %p\n", 2027 __kmp_gtid_from_thread(master_th), 2028 master_th->th.th_task_team, parent_team, 2029 team->t.t_task_team[master_th->th.th_task_state], team)); 2030 2031 if (active_level || master_th->th.th_task_team) { 2032 // Take a memo of master's task_state 2033 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2034 if (master_th->th.th_task_state_top >= 2035 master_th->th.th_task_state_stack_sz) { // increase size 2036 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2037 kmp_uint8 *old_stack, *new_stack; 2038 kmp_uint32 i; 2039 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2040 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2041 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2042 } 2043 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2044 ++i) { // zero-init rest of stack 2045 new_stack[i] = 0; 2046 } 2047 old_stack = master_th->th.th_task_state_memo_stack; 2048 master_th->th.th_task_state_memo_stack = new_stack; 2049 master_th->th.th_task_state_stack_sz = new_size; 2050 __kmp_free(old_stack); 2051 } 2052 // Store master's task_state on stack 2053 master_th->th 2054 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2055 master_th->th.th_task_state; 2056 master_th->th.th_task_state_top++; 2057 #if KMP_NESTED_HOT_TEAMS 2058 if (master_th->th.th_hot_teams && 2059 active_level < __kmp_hot_teams_max_level && 2060 team == master_th->th.th_hot_teams[active_level].hot_team) { 2061 // Restore master's nested state if nested hot team 2062 master_th->th.th_task_state = 2063 master_th->th 2064 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2065 } else { 2066 #endif 2067 master_th->th.th_task_state = 0; 2068 #if KMP_NESTED_HOT_TEAMS 2069 } 2070 #endif 2071 } 2072 #if !KMP_NESTED_HOT_TEAMS 2073 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2074 (team == root->r.r_hot_team)); 2075 #endif 2076 } 2077 2078 KA_TRACE( 2079 20, 2080 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2081 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2082 team->t.t_nproc)); 2083 KMP_DEBUG_ASSERT(team != 
root->r.r_hot_team || 2084 (team->t.t_master_tid == 0 && 2085 (team->t.t_parent == root->r.r_root_team || 2086 team->t.t_parent->t.t_serialized))); 2087 KMP_MB(); 2088 2089 /* now, setup the arguments */ 2090 argv = (void **)team->t.t_argv; 2091 if (ap) { 2092 for (i = argc - 1; i >= 0; --i) { 2093 // TODO: revert workaround for Intel(R) 64 tracker #96 2094 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 2095 void *new_argv = va_arg(*ap, void *); 2096 #else 2097 void *new_argv = va_arg(ap, void *); 2098 #endif 2099 KMP_CHECK_UPDATE(*argv, new_argv); 2100 argv++; 2101 } 2102 } else { 2103 for (i = 0; i < argc; ++i) { 2104 // Get args from parent team for teams construct 2105 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2106 } 2107 } 2108 2109 /* now actually fork the threads */ 2110 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2111 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2112 root->r.r_active = TRUE; 2113 2114 __kmp_fork_team_threads(root, team, master_th, gtid); 2115 __kmp_setup_icv_copy(team, nthreads, 2116 &master_th->th.th_current_task->td_icvs, loc); 2117 2118 #if OMPT_SUPPORT 2119 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2120 #endif 2121 2122 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2123 2124 #if USE_ITT_BUILD 2125 if (team->t.t_active_level == 1 // only report frames at level 1 2126 && !master_th->th.th_teams_microtask) { // not in teams construct 2127 #if USE_ITT_NOTIFY 2128 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2129 (__kmp_forkjoin_frames_mode == 3 || 2130 __kmp_forkjoin_frames_mode == 1)) { 2131 kmp_uint64 tmp_time = 0; 2132 if (__itt_get_timestamp_ptr) 2133 tmp_time = __itt_get_timestamp(); 2134 // Internal fork - report frame begin 2135 master_th->th.th_frame_time = tmp_time; 2136 if (__kmp_forkjoin_frames_mode == 3) 2137 team->t.t_region_time = tmp_time; 2138 } else 2139 // only one notification scheme (either "submit" or "forking/joined", not both) 2140 #endif /* USE_ITT_NOTIFY */ 2141 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2142 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2143 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 
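// KMP_CHECK_UPDATE / KMP_CHECK_UPDATE_SYNC and the r.r_active test above all
// apply the same rule: skip the store when the field already holds the
// desired value, so reusing a (hot) team does not dirty cache lines other
// threads are reading ("cache ping-pong"). An illustrative restatement of the
// idiom, not the macro's actual definition, excluded from the build:
#if 0
template <typename T> static void check_update_sketch(T &dst, const T &src) {
  if (dst != src) // write only if different
    dst = src;
}
#endif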
2144 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2145 } 2146 } 2147 #endif /* USE_ITT_BUILD */ 2148 2149 /* now go on and do the work */ 2150 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2151 KMP_MB(); 2152 KF_TRACE(10, 2153 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2154 root, team, master_th, gtid)); 2155 2156 #if USE_ITT_BUILD 2157 if (__itt_stack_caller_create_ptr) { 2158 team->t.t_stack_id = 2159 __kmp_itt_stack_caller_create(); // create new stack stitching id 2160 // before entering fork barrier 2161 } 2162 #endif /* USE_ITT_BUILD */ 2163 2164 // AC: skip __kmp_internal_fork at teams construct, let only master 2165 // threads execute 2166 if (ap) { 2167 __kmp_internal_fork(loc, gtid, team); 2168 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2169 "master_th=%p, gtid=%d\n", 2170 root, team, master_th, gtid)); 2171 } 2172 2173 if (call_context == fork_context_gnu) { 2174 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2175 return TRUE; 2176 } 2177 2178 /* Invoke microtask for MASTER thread */ 2179 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2180 team->t.t_id, team->t.t_pkfn)); 2181 } // END of timer KMP_fork_call block 2182 2183 #if KMP_STATS_ENABLED 2184 // If beginning a teams construct, then change thread state 2185 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2186 if (!ap) { 2187 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2188 } 2189 #endif 2190 2191 if (!team->t.t_invoke(gtid)) { 2192 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2193 } 2194 2195 #if KMP_STATS_ENABLED 2196 // If was beginning of a teams construct, then reset thread state 2197 if (!ap) { 2198 KMP_SET_THREAD_STATE(previous_state); 2199 } 2200 #endif 2201 2202 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2203 team->t.t_id, team->t.t_pkfn)); 2204 KMP_MB(); /* Flush all pending memory write invalidates. */ 2205 2206 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2207 2208 #if OMPT_SUPPORT 2209 if (ompt_enabled.enabled) { 2210 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2211 } 2212 #endif 2213 2214 return TRUE; 2215 } 2216 2217 #if OMPT_SUPPORT 2218 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2219 kmp_team_t *team) { 2220 // restore state outside the region 2221 thread->th.ompt_thread_info.state = 2222 ((team->t.t_serialized) ? 
ompt_state_work_serial 2223 : ompt_state_work_parallel); 2224 } 2225 2226 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2227 kmp_team_t *team, ompt_data_t *parallel_data, 2228 fork_context_e fork_context, void *codeptr) { 2229 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2230 if (ompt_enabled.ompt_callback_parallel_end) { 2231 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2232 parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context), 2233 codeptr); 2234 } 2235 2236 task_info->frame.enter_frame = ompt_data_none; 2237 __kmp_join_restore_state(thread, team); 2238 } 2239 #endif 2240 2241 void __kmp_join_call(ident_t *loc, int gtid 2242 #if OMPT_SUPPORT 2243 , 2244 enum fork_context_e fork_context 2245 #endif 2246 , 2247 int exit_teams) { 2248 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2249 kmp_team_t *team; 2250 kmp_team_t *parent_team; 2251 kmp_info_t *master_th; 2252 kmp_root_t *root; 2253 int master_active; 2254 2255 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2256 2257 /* setup current data */ 2258 master_th = __kmp_threads[gtid]; 2259 root = master_th->th.th_root; 2260 team = master_th->th.th_team; 2261 parent_team = team->t.t_parent; 2262 2263 master_th->th.th_ident = loc; 2264 2265 #if OMPT_SUPPORT 2266 if (ompt_enabled.enabled) { 2267 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2268 } 2269 #endif 2270 2271 #if KMP_DEBUG 2272 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2273 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2274 "th_task_team = %p\n", 2275 __kmp_gtid_from_thread(master_th), team, 2276 team->t.t_task_team[master_th->th.th_task_state], 2277 master_th->th.th_task_team)); 2278 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2279 team->t.t_task_team[master_th->th.th_task_state]); 2280 } 2281 #endif 2282 2283 if (team->t.t_serialized) { 2284 if (master_th->th.th_teams_microtask) { 2285 // We are in teams construct 2286 int level = team->t.t_level; 2287 int tlevel = master_th->th.th_teams_level; 2288 if (level == tlevel) { 2289 // AC: we haven't incremented it earlier at start of teams construct, 2290 // so do it here - at the end of teams construct 2291 team->t.t_level++; 2292 } else if (level == tlevel + 1) { 2293 // AC: we are exiting parallel inside teams, need to increment 2294 // serialization in order to restore it in the next call to 2295 // __kmpc_end_serialized_parallel 2296 team->t.t_serialized++; 2297 } 2298 } 2299 __kmpc_end_serialized_parallel(loc, gtid); 2300 2301 #if OMPT_SUPPORT 2302 if (ompt_enabled.enabled) { 2303 __kmp_join_restore_state(master_th, parent_team); 2304 } 2305 #endif 2306 2307 return; 2308 } 2309 2310 master_active = team->t.t_master_active; 2311 2312 if (!exit_teams) { 2313 // AC: No barrier for internal teams at exit from teams construct. 2314 // But there is barrier for external team (league). 
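// The early return above (team->t.t_serialized) is the join half of a
// parallel region that was serialized at fork time -- e.g. an if(0) clause or
// a nested region with no spare nesting levels -- so the join reduces to
// __kmpc_end_serialized_parallel plus OMPT state restoration. A user-level
// illustration in plain OpenMP (nothing runtime-specific), excluded from the
// build:
#if 0
#include <omp.h>
#include <cstdio>
static void serialized_join_example() {
  #pragma omp parallel num_threads(2)
  {
    // if(0) forces the inner region to be serialized; its fork and join both
    // take the serialized fast paths in this file.
    #pragma omp parallel if (0)
    { std::printf("inner team size = %d\n", omp_get_num_threads()); } // 1
  }
}
#endif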
2315 __kmp_internal_join(loc, gtid, team); 2316 } else { 2317 master_th->th.th_task_state = 2318 0; // AC: no tasking in teams (out of any parallel) 2319 } 2320 2321 KMP_MB(); 2322 2323 #if OMPT_SUPPORT 2324 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2325 void *codeptr = team->t.ompt_team_info.master_return_address; 2326 #endif 2327 2328 #if USE_ITT_BUILD 2329 if (__itt_stack_caller_create_ptr) { 2330 __kmp_itt_stack_caller_destroy( 2331 (__itt_caller)team->t 2332 .t_stack_id); // destroy the stack stitching id after join barrier 2333 } 2334 2335 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2336 if (team->t.t_active_level == 1 && 2337 !master_th->th.th_teams_microtask) { /* not in teams construct */ 2338 master_th->th.th_ident = loc; 2339 // only one notification scheme (either "submit" or "forking/joined", not 2340 // both) 2341 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2342 __kmp_forkjoin_frames_mode == 3) 2343 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2344 master_th->th.th_frame_time, 0, loc, 2345 master_th->th.th_team_nproc, 1); 2346 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2347 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2348 __kmp_itt_region_joined(gtid); 2349 } // active_level == 1 2350 #endif /* USE_ITT_BUILD */ 2351 2352 if (master_th->th.th_teams_microtask && !exit_teams && 2353 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2354 team->t.t_level == master_th->th.th_teams_level + 1) { 2355 // AC: We need to leave the team structure intact at the end of parallel 2356 // inside the teams construct, so that at the next parallel same (hot) team 2357 // works, only adjust nesting levels 2358 2359 /* Decrement our nested depth level */ 2360 team->t.t_level--; 2361 team->t.t_active_level--; 2362 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2363 2364 // Restore number of threads in the team if needed. This code relies on 2365 // the proper adjustment of th_teams_size.nth after the fork in 2366 // __kmp_teams_master on each teams master in the case that 2367 // __kmp_reserve_threads reduced it. 2368 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2369 int old_num = master_th->th.th_team_nproc; 2370 int new_num = master_th->th.th_teams_size.nth; 2371 kmp_info_t **other_threads = team->t.t_threads; 2372 team->t.t_nproc = new_num; 2373 for (int i = 0; i < old_num; ++i) { 2374 other_threads[i]->th.th_team_nproc = new_num; 2375 } 2376 // Adjust states of non-used threads of the team 2377 for (int i = old_num; i < new_num; ++i) { 2378 // Re-initialize thread's barrier data. 
2379 KMP_DEBUG_ASSERT(other_threads[i]); 2380 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2381 for (int b = 0; b < bs_last_barrier; ++b) { 2382 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2383 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2384 #if USE_DEBUGGER 2385 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2386 #endif 2387 } 2388 if (__kmp_tasking_mode != tskm_immediate_exec) { 2389 // Synchronize thread's task state 2390 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2391 } 2392 } 2393 } 2394 2395 #if OMPT_SUPPORT 2396 if (ompt_enabled.enabled) { 2397 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, 2398 codeptr); 2399 } 2400 #endif 2401 2402 return; 2403 } 2404 2405 /* do cleanup and restore the parent team */ 2406 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2407 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2408 2409 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2410 2411 /* jc: The following lock has instructions with REL and ACQ semantics, 2412 separating the parallel user code called in this parallel region 2413 from the serial user code called after this function returns. */ 2414 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2415 2416 if (!master_th->th.th_teams_microtask || 2417 team->t.t_level > master_th->th.th_teams_level) { 2418 /* Decrement our nested depth level */ 2419 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2420 } 2421 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2422 2423 #if OMPT_SUPPORT 2424 if (ompt_enabled.enabled) { 2425 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2426 if (ompt_enabled.ompt_callback_implicit_task) { 2427 int ompt_team_size = team->t.t_nproc; 2428 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2429 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2430 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 2431 } 2432 2433 task_info->frame.exit_frame = ompt_data_none; 2434 task_info->task_data = ompt_data_none; 2435 } 2436 #endif 2437 2438 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2439 master_th, team)); 2440 __kmp_pop_current_task_from_thread(master_th); 2441 2442 #if KMP_AFFINITY_SUPPORTED 2443 // Restore master thread's partition. 2444 master_th->th.th_first_place = team->t.t_first_place; 2445 master_th->th.th_last_place = team->t.t_last_place; 2446 #endif // KMP_AFFINITY_SUPPORTED 2447 master_th->th.th_def_allocator = team->t.t_def_allocator; 2448 2449 updateHWFPControl(team); 2450 2451 if (root->r.r_active != master_active) 2452 root->r.r_active = master_active; 2453 2454 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2455 master_th)); // this will free worker threads 2456 2457 /* this race was fun to find. make sure the following is in the critical 2458 region otherwise assertions may fail occasionally since the old team may be 2459 reallocated and the hierarchy appears inconsistent. it is actually safe to 2460 run and won't cause any bugs, but will cause those assertion failures. 
it's 2461 only one deref&assign so might as well put this in the critical region */ 2462 master_th->th.th_team = parent_team; 2463 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2464 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2465 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2466 2467 /* restore serialized team, if need be */ 2468 if (parent_team->t.t_serialized && 2469 parent_team != master_th->th.th_serial_team && 2470 parent_team != root->r.r_root_team) { 2471 __kmp_free_team(root, 2472 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2473 master_th->th.th_serial_team = parent_team; 2474 } 2475 2476 if (__kmp_tasking_mode != tskm_immediate_exec) { 2477 if (master_th->th.th_task_state_top > 2478 0) { // Restore task state from memo stack 2479 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2480 // Remember master's state if we re-use this nested hot team 2481 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2482 master_th->th.th_task_state; 2483 --master_th->th.th_task_state_top; // pop 2484 // Now restore state at this level 2485 master_th->th.th_task_state = 2486 master_th->th 2487 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2488 } 2489 // Copy the task team from the parent team to the master thread 2490 master_th->th.th_task_team = 2491 parent_team->t.t_task_team[master_th->th.th_task_state]; 2492 KA_TRACE(20, 2493 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2494 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2495 parent_team)); 2496 } 2497 2498 // TODO: GEH - cannot do this assertion because root thread not set up as 2499 // executing 2500 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2501 master_th->th.th_current_task->td_flags.executing = 1; 2502 2503 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2504 2505 #if OMPT_SUPPORT 2506 if (ompt_enabled.enabled) { 2507 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, 2508 codeptr); 2509 } 2510 #endif 2511 2512 KMP_MB(); 2513 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2514 } 2515 2516 /* Check whether we should push an internal control record onto the 2517 serial team stack. If so, do it. 
*/ 2518 void __kmp_save_internal_controls(kmp_info_t *thread) { 2519 2520 if (thread->th.th_team != thread->th.th_serial_team) { 2521 return; 2522 } 2523 if (thread->th.th_team->t.t_serialized > 1) { 2524 int push = 0; 2525 2526 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2527 push = 1; 2528 } else { 2529 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2530 thread->th.th_team->t.t_serialized) { 2531 push = 1; 2532 } 2533 } 2534 if (push) { /* push a record on the serial team's stack */ 2535 kmp_internal_control_t *control = 2536 (kmp_internal_control_t *)__kmp_allocate( 2537 sizeof(kmp_internal_control_t)); 2538 2539 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2540 2541 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2542 2543 control->next = thread->th.th_team->t.t_control_stack_top; 2544 thread->th.th_team->t.t_control_stack_top = control; 2545 } 2546 } 2547 } 2548 2549 /* Changes set_nproc */ 2550 void __kmp_set_num_threads(int new_nth, int gtid) { 2551 kmp_info_t *thread; 2552 kmp_root_t *root; 2553 2554 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2555 KMP_DEBUG_ASSERT(__kmp_init_serial); 2556 2557 if (new_nth < 1) 2558 new_nth = 1; 2559 else if (new_nth > __kmp_max_nth) 2560 new_nth = __kmp_max_nth; 2561 2562 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2563 thread = __kmp_threads[gtid]; 2564 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2565 return; // nothing to do 2566 2567 __kmp_save_internal_controls(thread); 2568 2569 set__nproc(thread, new_nth); 2570 2571 // If this omp_set_num_threads() call will cause the hot team size to be 2572 // reduced (in the absence of a num_threads clause), then reduce it now, 2573 // rather than waiting for the next parallel region. 2574 root = thread->th.th_root; 2575 if (__kmp_init_parallel && (!root->r.r_active) && 2576 (root->r.r_hot_team->t.t_nproc > new_nth) 2577 #if KMP_NESTED_HOT_TEAMS 2578 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2579 #endif 2580 ) { 2581 kmp_team_t *hot_team = root->r.r_hot_team; 2582 int f; 2583 2584 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2585 2586 // Release the extra threads we don't need any more. 2587 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2588 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2589 if (__kmp_tasking_mode != tskm_immediate_exec) { 2590 // When decreasing team size, threads no longer in the team should unref 2591 // task team. 2592 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2593 } 2594 __kmp_free_thread(hot_team->t.t_threads[f]); 2595 hot_team->t.t_threads[f] = NULL; 2596 } 2597 hot_team->t.t_nproc = new_nth; 2598 #if KMP_NESTED_HOT_TEAMS 2599 if (thread->th.th_hot_teams) { 2600 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2601 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2602 } 2603 #endif 2604 2605 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2606 2607 // Update the t_nproc field in the threads that are still active. 
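// As validated above, a request outside [1, __kmp_max_nth] is clamped, and
// when the root is idle the hot team is trimmed immediately instead of at the
// next fork. From user code this is simply omp_set_num_threads(); a small
// usage sketch (excluded from the build):
#if 0
#include <omp.h>
#include <cstdio>
static void set_num_threads_usage() {
  omp_set_num_threads(0); // out-of-range request: this runtime clamps it to 1
  std::printf("nthreads-var = %d\n", omp_get_max_threads());
  omp_set_num_threads(4); // next parallel region without num_threads uses 4
  #pragma omp parallel
  { /* ... */ }
}
#endif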
2608 for (f = 0; f < new_nth; f++) { 2609 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2610 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2611 } 2612 // Special flag in case omp_set_num_threads() call 2613 hot_team->t.t_size_changed = -1; 2614 } 2615 } 2616 2617 /* Changes max_active_levels */ 2618 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2619 kmp_info_t *thread; 2620 2621 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2622 "%d = (%d)\n", 2623 gtid, max_active_levels)); 2624 KMP_DEBUG_ASSERT(__kmp_init_serial); 2625 2626 // validate max_active_levels 2627 if (max_active_levels < 0) { 2628 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2629 // We ignore this call if the user has specified a negative value. 2630 // The current setting won't be changed. The last valid setting will be 2631 // used. A warning will be issued (if warnings are allowed as controlled by 2632 // the KMP_WARNINGS env var). 2633 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2634 "max_active_levels for thread %d = (%d)\n", 2635 gtid, max_active_levels)); 2636 return; 2637 } 2638 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2639 // it's OK, the max_active_levels is within the valid range: [ 0; 2640 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2641 // We allow a zero value. (implementation defined behavior) 2642 } else { 2643 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2644 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2645 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2646 // Current upper limit is MAX_INT. (implementation defined behavior) 2647 // If the input exceeds the upper limit, we correct the input to be the 2648 // upper limit. (implementation defined behavior) 2649 // Actually, the flow should never get here until we use MAX_INT limit. 2650 } 2651 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2652 "max_active_levels for thread %d = (%d)\n", 2653 gtid, max_active_levels)); 2654 2655 thread = __kmp_threads[gtid]; 2656 2657 __kmp_save_internal_controls(thread); 2658 2659 set__max_active_levels(thread, max_active_levels); 2660 } 2661 2662 /* Gets max_active_levels */ 2663 int __kmp_get_max_active_levels(int gtid) { 2664 kmp_info_t *thread; 2665 2666 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2667 KMP_DEBUG_ASSERT(__kmp_init_serial); 2668 2669 thread = __kmp_threads[gtid]; 2670 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2671 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2672 "curtask_maxaclevel=%d\n", 2673 gtid, thread->th.th_current_task, 2674 thread->th.th_current_task->td_icvs.max_active_levels)); 2675 return thread->th.th_current_task->td_icvs.max_active_levels; 2676 } 2677 2678 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2679 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2680 2681 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2682 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2683 kmp_info_t *thread; 2684 kmp_sched_t orig_kind; 2685 // kmp_team_t *team; 2686 2687 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2688 gtid, (int)kind, chunk)); 2689 KMP_DEBUG_ASSERT(__kmp_init_serial); 2690 2691 // Check if the kind parameter is valid, correct if needed. 
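// __kmp_set_max_active_levels above ignores negative requests (with a
// warning) and caps values at KMP_MAX_ACTIVE_LEVELS_LIMIT; the matching user
// entry points are omp_set_max_active_levels / omp_get_max_active_levels. A
// brief usage sketch (excluded from the build):
#if 0
#include <omp.h>
#include <cstdio>
static void max_active_levels_usage() {
  omp_set_max_active_levels(-1); // negative: ignored, previous setting kept
  omp_set_max_active_levels(2);  // allow two nested active parallel levels
  std::printf("max-active-levels-var = %d\n", omp_get_max_active_levels());
}
#endif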
2692 // Valid parameters should fit in one of two intervals - standard or extended: 2693 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2694 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2695 orig_kind = kind; 2696 kind = __kmp_sched_without_mods(kind); 2697 2698 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2699 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2700 // TODO: Hint needs attention in case we change the default schedule. 2701 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2702 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2703 __kmp_msg_null); 2704 kind = kmp_sched_default; 2705 chunk = 0; // ignore chunk value in case of bad kind 2706 } 2707 2708 thread = __kmp_threads[gtid]; 2709 2710 __kmp_save_internal_controls(thread); 2711 2712 if (kind < kmp_sched_upper_std) { 2713 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2714 // differ static chunked vs. unchunked: chunk should be invalid to 2715 // indicate unchunked schedule (which is the default) 2716 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2717 } else { 2718 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2719 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2720 } 2721 } else { 2722 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2723 // kmp_sched_lower - 2 ]; 2724 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2725 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2726 kmp_sched_lower - 2]; 2727 } 2728 __kmp_sched_apply_mods_intkind( 2729 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2730 if (kind == kmp_sched_auto || chunk < 1) { 2731 // ignore parameter chunk for schedule auto 2732 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2733 } else { 2734 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2735 } 2736 } 2737 2738 /* Gets def_sched_var ICV values */ 2739 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2740 kmp_info_t *thread; 2741 enum sched_type th_type; 2742 2743 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2744 KMP_DEBUG_ASSERT(__kmp_init_serial); 2745 2746 thread = __kmp_threads[gtid]; 2747 2748 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2749 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2750 case kmp_sch_static: 2751 case kmp_sch_static_greedy: 2752 case kmp_sch_static_balanced: 2753 *kind = kmp_sched_static; 2754 __kmp_sched_apply_mods_stdkind(kind, th_type); 2755 *chunk = 0; // chunk was not set, try to show this fact via zero value 2756 return; 2757 case kmp_sch_static_chunked: 2758 *kind = kmp_sched_static; 2759 break; 2760 case kmp_sch_dynamic_chunked: 2761 *kind = kmp_sched_dynamic; 2762 break; 2763 case kmp_sch_guided_chunked: 2764 case kmp_sch_guided_iterative_chunked: 2765 case kmp_sch_guided_analytical_chunked: 2766 *kind = kmp_sched_guided; 2767 break; 2768 case kmp_sch_auto: 2769 *kind = kmp_sched_auto; 2770 break; 2771 case kmp_sch_trapezoidal: 2772 *kind = kmp_sched_trapezoidal; 2773 break; 2774 #if KMP_STATIC_STEAL_ENABLED 2775 case kmp_sch_static_steal: 2776 *kind = kmp_sched_static_steal; 2777 break; 2778 #endif 2779 default: 2780 KMP_FATAL(UnknownSchedulingType, th_type); 2781 } 2782 2783 __kmp_sched_apply_mods_stdkind(kind, th_type); 2784 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2785 } 2786 2787 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2788 2789 int ii, dd; 2790 kmp_team_t *team; 2791 
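// The mapping above translates the public kmp_sched_t kinds into internal
// sched_type values, falls back to the default kind for out-of-range input,
// and treats chunk < 1 (or kind auto) as "use the default chunk". At the
// OpenMP API level the same ICV is driven by omp_set_schedule /
// omp_get_schedule; a usage sketch (excluded from the build):
#if 0
#include <omp.h>
#include <cstdio>
static void schedule_usage() {
  omp_set_schedule(omp_sched_dynamic, 4); // runtime schedule: dynamic, chunk 4
  omp_sched_t kind;
  int chunk;
  omp_get_schedule(&kind, &chunk);
  std::printf("kind=%d chunk=%d\n", (int)kind, chunk);
  omp_set_schedule(omp_sched_static, 0); // chunk < 1: default, unchunked static
}
#endif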
kmp_info_t *thr; 2792 2793 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2794 KMP_DEBUG_ASSERT(__kmp_init_serial); 2795 2796 // validate level 2797 if (level == 0) 2798 return 0; 2799 if (level < 0) 2800 return -1; 2801 thr = __kmp_threads[gtid]; 2802 team = thr->th.th_team; 2803 ii = team->t.t_level; 2804 if (level > ii) 2805 return -1; 2806 2807 if (thr->th.th_teams_microtask) { 2808 // AC: we are in teams region where multiple nested teams have same level 2809 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2810 if (level <= 2811 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2812 KMP_DEBUG_ASSERT(ii >= tlevel); 2813 // AC: As we need to pass by the teams league, we need to artificially 2814 // increase ii 2815 if (ii == tlevel) { 2816 ii += 2; // three teams have same level 2817 } else { 2818 ii++; // two teams have same level 2819 } 2820 } 2821 } 2822 2823 if (ii == level) 2824 return __kmp_tid_from_gtid(gtid); 2825 2826 dd = team->t.t_serialized; 2827 level++; 2828 while (ii > level) { 2829 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2830 } 2831 if ((team->t.t_serialized) && (!dd)) { 2832 team = team->t.t_parent; 2833 continue; 2834 } 2835 if (ii > level) { 2836 team = team->t.t_parent; 2837 dd = team->t.t_serialized; 2838 ii--; 2839 } 2840 } 2841 2842 return (dd > 1) ? (0) : (team->t.t_master_tid); 2843 } 2844 2845 int __kmp_get_team_size(int gtid, int level) { 2846 2847 int ii, dd; 2848 kmp_team_t *team; 2849 kmp_info_t *thr; 2850 2851 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2852 KMP_DEBUG_ASSERT(__kmp_init_serial); 2853 2854 // validate level 2855 if (level == 0) 2856 return 1; 2857 if (level < 0) 2858 return -1; 2859 thr = __kmp_threads[gtid]; 2860 team = thr->th.th_team; 2861 ii = team->t.t_level; 2862 if (level > ii) 2863 return -1; 2864 2865 if (thr->th.th_teams_microtask) { 2866 // AC: we are in teams region where multiple nested teams have same level 2867 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2868 if (level <= 2869 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2870 KMP_DEBUG_ASSERT(ii >= tlevel); 2871 // AC: As we need to pass by the teams league, we need to artificially 2872 // increase ii 2873 if (ii == tlevel) { 2874 ii += 2; // three teams have same level 2875 } else { 2876 ii++; // two teams have same level 2877 } 2878 } 2879 } 2880 2881 while (ii > level) { 2882 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2883 } 2884 if (team->t.t_serialized && (!dd)) { 2885 team = team->t.t_parent; 2886 continue; 2887 } 2888 if (ii > level) { 2889 team = team->t.t_parent; 2890 ii--; 2891 } 2892 } 2893 2894 return team->t.t_nproc; 2895 } 2896 2897 kmp_r_sched_t __kmp_get_schedule_global() { 2898 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2899 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2900 // independently. So one can get the updated schedule here. 2901 2902 kmp_r_sched_t r_sched; 2903 2904 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2905 // __kmp_guided. 
__kmp_sched should keep original value, so that user can set 2906 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2907 // different roots (even in OMP 2.5) 2908 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2909 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2910 if (s == kmp_sch_static) { 2911 // replace STATIC with more detailed schedule (balanced or greedy) 2912 r_sched.r_sched_type = __kmp_static; 2913 } else if (s == kmp_sch_guided_chunked) { 2914 // replace GUIDED with more detailed schedule (iterative or analytical) 2915 r_sched.r_sched_type = __kmp_guided; 2916 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2917 r_sched.r_sched_type = __kmp_sched; 2918 } 2919 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2920 2921 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2922 // __kmp_chunk may be wrong here (if it was not ever set) 2923 r_sched.chunk = KMP_DEFAULT_CHUNK; 2924 } else { 2925 r_sched.chunk = __kmp_chunk; 2926 } 2927 2928 return r_sched; 2929 } 2930 2931 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 2932 at least argc number of *t_argv entries for the requested team. */ 2933 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 2934 2935 KMP_DEBUG_ASSERT(team); 2936 if (!realloc || argc > team->t.t_max_argc) { 2937 2938 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 2939 "current entries=%d\n", 2940 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 2941 /* if previously allocated heap space for args, free them */ 2942 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 2943 __kmp_free((void *)team->t.t_argv); 2944 2945 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 2946 /* use unused space in the cache line for arguments */ 2947 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 2948 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 2949 "argv entries\n", 2950 team->t.t_id, team->t.t_max_argc)); 2951 team->t.t_argv = &team->t.t_inline_argv[0]; 2952 if (__kmp_storage_map) { 2953 __kmp_print_storage_map_gtid( 2954 -1, &team->t.t_inline_argv[0], 2955 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 2956 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 2957 team->t.t_id); 2958 } 2959 } else { 2960 /* allocate space for arguments in the heap */ 2961 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 2962 ? KMP_MIN_MALLOC_ARGV_ENTRIES 2963 : 2 * argc; 2964 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 2965 "argv entries\n", 2966 team->t.t_id, team->t.t_max_argc)); 2967 team->t.t_argv = 2968 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 2969 if (__kmp_storage_map) { 2970 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 2971 &team->t.t_argv[team->t.t_max_argc], 2972 sizeof(void *) * team->t.t_max_argc, 2973 "team_%d.t_argv", team->t.t_id); 2974 } 2975 } 2976 } 2977 } 2978 2979 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 2980 int i; 2981 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 2982 team->t.t_threads = 2983 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 2984 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 2985 sizeof(dispatch_shared_info_t) * num_disp_buff); 2986 team->t.t_dispatch = 2987 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 2988 team->t.t_implicit_task_taskdata = 2989 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 2990 team->t.t_max_nproc = max_nth; 2991 2992 /* setup dispatch buffers */ 2993 for (i = 0; i < num_disp_buff; ++i) { 2994 team->t.t_disp_buffer[i].buffer_index = i; 2995 team->t.t_disp_buffer[i].doacross_buf_idx = i; 2996 } 2997 } 2998 2999 static void __kmp_free_team_arrays(kmp_team_t *team) { 3000 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3001 int i; 3002 for (i = 0; i < team->t.t_max_nproc; ++i) { 3003 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3004 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3005 team->t.t_dispatch[i].th_disp_buffer = NULL; 3006 } 3007 } 3008 #if KMP_USE_HIER_SCHED 3009 __kmp_dispatch_free_hierarchies(team); 3010 #endif 3011 __kmp_free(team->t.t_threads); 3012 __kmp_free(team->t.t_disp_buffer); 3013 __kmp_free(team->t.t_dispatch); 3014 __kmp_free(team->t.t_implicit_task_taskdata); 3015 team->t.t_threads = NULL; 3016 team->t.t_disp_buffer = NULL; 3017 team->t.t_dispatch = NULL; 3018 team->t.t_implicit_task_taskdata = 0; 3019 } 3020 3021 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3022 kmp_info_t **oldThreads = team->t.t_threads; 3023 3024 __kmp_free(team->t.t_disp_buffer); 3025 __kmp_free(team->t.t_dispatch); 3026 __kmp_free(team->t.t_implicit_task_taskdata); 3027 __kmp_allocate_team_arrays(team, max_nth); 3028 3029 KMP_MEMCPY(team->t.t_threads, oldThreads, 3030 team->t.t_nproc * sizeof(kmp_info_t *)); 3031 3032 __kmp_free(oldThreads); 3033 } 3034 3035 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3036 3037 kmp_r_sched_t r_sched = 3038 __kmp_get_schedule_global(); // get current state of scheduling globals 3039 3040 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3041 3042 kmp_internal_control_t g_icvs = { 3043 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3044 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3045 // adjustment of threads (per thread) 3046 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3047 // whether blocktime is explicitly set 3048 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3049 #if KMP_USE_MONITOR 3050 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3051 // intervals 3052 #endif 3053 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3054 // next parallel region (per thread) 3055 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3056 __kmp_cg_max_nth, // int thread_limit; 3057 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3058 // for max_active_levels 3059 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3060 // {sched,chunk} pair 3061 __kmp_nested_proc_bind.bind_types[0], 3062 __kmp_default_device, 3063 NULL // struct kmp_internal_control *next; 3064 }; 3065 3066 return g_icvs; 3067 } 3068 3069 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3070 3071 kmp_internal_control_t gx_icvs; 3072 gx_icvs.serial_nesting_level = 3073 0; // probably =team->t.t_serial 
like in save_inter_controls 3074 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3075 gx_icvs.next = NULL; 3076 3077 return gx_icvs; 3078 } 3079 3080 static void __kmp_initialize_root(kmp_root_t *root) { 3081 int f; 3082 kmp_team_t *root_team; 3083 kmp_team_t *hot_team; 3084 int hot_team_max_nth; 3085 kmp_r_sched_t r_sched = 3086 __kmp_get_schedule_global(); // get current state of scheduling globals 3087 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3088 KMP_DEBUG_ASSERT(root); 3089 KMP_ASSERT(!root->r.r_begin); 3090 3091 /* setup the root state structure */ 3092 __kmp_init_lock(&root->r.r_begin_lock); 3093 root->r.r_begin = FALSE; 3094 root->r.r_active = FALSE; 3095 root->r.r_in_parallel = 0; 3096 root->r.r_blocktime = __kmp_dflt_blocktime; 3097 3098 /* setup the root team for this task */ 3099 /* allocate the root team structure */ 3100 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3101 3102 root_team = 3103 __kmp_allocate_team(root, 3104 1, // new_nproc 3105 1, // max_nproc 3106 #if OMPT_SUPPORT 3107 ompt_data_none, // root parallel id 3108 #endif 3109 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3110 0 // argc 3111 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3112 ); 3113 #if USE_DEBUGGER 3114 // Non-NULL value should be assigned to make the debugger display the root 3115 // team. 3116 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3117 #endif 3118 3119 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3120 3121 root->r.r_root_team = root_team; 3122 root_team->t.t_control_stack_top = NULL; 3123 3124 /* initialize root team */ 3125 root_team->t.t_threads[0] = NULL; 3126 root_team->t.t_nproc = 1; 3127 root_team->t.t_serialized = 1; 3128 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3129 root_team->t.t_sched.sched = r_sched.sched; 3130 KA_TRACE( 3131 20, 3132 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3133 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3134 3135 /* setup the hot team for this task */ 3136 /* allocate the hot team structure */ 3137 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3138 3139 hot_team = 3140 __kmp_allocate_team(root, 3141 1, // new_nproc 3142 __kmp_dflt_team_nth_ub * 2, // max_nproc 3143 #if OMPT_SUPPORT 3144 ompt_data_none, // root parallel id 3145 #endif 3146 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3147 0 // argc 3148 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3149 ); 3150 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3151 3152 root->r.r_hot_team = hot_team; 3153 root_team->t.t_control_stack_top = NULL; 3154 3155 /* first-time initialization */ 3156 hot_team->t.t_parent = root_team; 3157 3158 /* initialize hot team */ 3159 hot_team_max_nth = hot_team->t.t_max_nproc; 3160 for (f = 0; f < hot_team_max_nth; ++f) { 3161 hot_team->t.t_threads[f] = NULL; 3162 } 3163 hot_team->t.t_nproc = 1; 3164 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3165 hot_team->t.t_sched.sched = r_sched.sched; 3166 hot_team->t.t_size_changed = 0; 3167 } 3168 3169 #ifdef KMP_DEBUG 3170 3171 typedef struct kmp_team_list_item { 3172 kmp_team_p const *entry; 3173 struct kmp_team_list_item *next; 3174 } kmp_team_list_item_t; 3175 typedef kmp_team_list_item_t *kmp_team_list_t; 3176 3177 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3178 kmp_team_list_t list, // List of teams. 
3179 kmp_team_p const *team // Team to add. 3180 ) { 3181 3182 // List must terminate with item where both entry and next are NULL. 3183 // Team is added to the list only once. 3184 // List is sorted in ascending order by team id. 3185 // Team id is *not* a key. 3186 3187 kmp_team_list_t l; 3188 3189 KMP_DEBUG_ASSERT(list != NULL); 3190 if (team == NULL) { 3191 return; 3192 } 3193 3194 __kmp_print_structure_team_accum(list, team->t.t_parent); 3195 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3196 3197 // Search list for the team. 3198 l = list; 3199 while (l->next != NULL && l->entry != team) { 3200 l = l->next; 3201 } 3202 if (l->next != NULL) { 3203 return; // Team has been added before, exit. 3204 } 3205 3206 // Team is not found. Search list again for insertion point. 3207 l = list; 3208 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3209 l = l->next; 3210 } 3211 3212 // Insert team. 3213 { 3214 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3215 sizeof(kmp_team_list_item_t)); 3216 *item = *l; 3217 l->entry = team; 3218 l->next = item; 3219 } 3220 } 3221 3222 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3223 3224 ) { 3225 __kmp_printf("%s", title); 3226 if (team != NULL) { 3227 __kmp_printf("%2x %p\n", team->t.t_id, team); 3228 } else { 3229 __kmp_printf(" - (nil)\n"); 3230 } 3231 } 3232 3233 static void __kmp_print_structure_thread(char const *title, 3234 kmp_info_p const *thread) { 3235 __kmp_printf("%s", title); 3236 if (thread != NULL) { 3237 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3238 } else { 3239 __kmp_printf(" - (nil)\n"); 3240 } 3241 } 3242 3243 void __kmp_print_structure(void) { 3244 3245 kmp_team_list_t list; 3246 3247 // Initialize list of teams. 3248 list = 3249 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3250 list->entry = NULL; 3251 list->next = NULL; 3252 3253 __kmp_printf("\n------------------------------\nGlobal Thread " 3254 "Table\n------------------------------\n"); 3255 { 3256 int gtid; 3257 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3258 __kmp_printf("%2d", gtid); 3259 if (__kmp_threads != NULL) { 3260 __kmp_printf(" %p", __kmp_threads[gtid]); 3261 } 3262 if (__kmp_root != NULL) { 3263 __kmp_printf(" %p", __kmp_root[gtid]); 3264 } 3265 __kmp_printf("\n"); 3266 } 3267 } 3268 3269 // Print out __kmp_threads array. 
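// __kmp_print_structure_team_accum above keeps the debug list sorted by team
// id and terminated by a sentinel item whose entry and next are both NULL.
// The splice copies the current item into the newly allocated node and then
// overwrites the current item in place, so no pointer to the previous node is
// needed. A generic sketch of that splice with illustrative types (duplicate
// detection omitted), excluded from the build:
#if 0
#include <cstdlib>
struct accum_item_sketch {
  int key;                 // stands in for team->t.t_id
  const void *payload;     // stands in for the team pointer
  accum_item_sketch *next; // sentinel item: payload == NULL && next == NULL
};
static void sorted_insert_sketch(accum_item_sketch *list, int key,
                                 const void *payload) {
  accum_item_sketch *l = list;
  while (l->next != NULL && l->key <= key)
    l = l->next; // stop at the sentinel or at the first larger key
  accum_item_sketch *copy =
      (accum_item_sketch *)std::malloc(sizeof(accum_item_sketch));
  *copy = *l;   // push the current contents (possibly the sentinel) down
  l->key = key; // then overwrite the current node in place
  l->payload = payload;
  l->next = copy;
}
#endif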
3270 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3271 "----------\n"); 3272 if (__kmp_threads != NULL) { 3273 int gtid; 3274 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3275 kmp_info_t const *thread = __kmp_threads[gtid]; 3276 if (thread != NULL) { 3277 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3278 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3279 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3280 __kmp_print_structure_team(" Serial Team: ", 3281 thread->th.th_serial_team); 3282 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3283 __kmp_print_structure_thread(" Master: ", 3284 thread->th.th_team_master); 3285 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3286 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3287 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3288 __kmp_print_structure_thread(" Next in pool: ", 3289 thread->th.th_next_pool); 3290 __kmp_printf("\n"); 3291 __kmp_print_structure_team_accum(list, thread->th.th_team); 3292 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3293 } 3294 } 3295 } else { 3296 __kmp_printf("Threads array is not allocated.\n"); 3297 } 3298 3299 // Print out __kmp_root array. 3300 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3301 "--------\n"); 3302 if (__kmp_root != NULL) { 3303 int gtid; 3304 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3305 kmp_root_t const *root = __kmp_root[gtid]; 3306 if (root != NULL) { 3307 __kmp_printf("GTID %2d %p:\n", gtid, root); 3308 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3309 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3310 __kmp_print_structure_thread(" Uber Thread: ", 3311 root->r.r_uber_thread); 3312 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3313 __kmp_printf(" In Parallel: %2d\n", 3314 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3315 __kmp_printf("\n"); 3316 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3317 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3318 } 3319 } 3320 } else { 3321 __kmp_printf("Ubers array is not allocated.\n"); 3322 } 3323 3324 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3325 "--------\n"); 3326 while (list->next != NULL) { 3327 kmp_team_p const *team = list->entry; 3328 int i; 3329 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3330 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3331 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3332 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3333 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3334 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3335 for (i = 0; i < team->t.t_nproc; ++i) { 3336 __kmp_printf(" Thread %2d: ", i); 3337 __kmp_print_structure_thread("", team->t.t_threads[i]); 3338 } 3339 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3340 __kmp_printf("\n"); 3341 list = list->next; 3342 } 3343 3344 // Print out __kmp_thread_pool and __kmp_team_pool. 3345 __kmp_printf("\n------------------------------\nPools\n----------------------" 3346 "--------\n"); 3347 __kmp_print_structure_thread("Thread pool: ", 3348 CCAST(kmp_info_t *, __kmp_thread_pool)); 3349 __kmp_print_structure_team("Team pool: ", 3350 CCAST(kmp_team_t *, __kmp_team_pool)); 3351 __kmp_printf("\n"); 3352 3353 // Free team list. 
3354 while (list != NULL) { 3355 kmp_team_list_item_t *item = list; 3356 list = list->next; 3357 KMP_INTERNAL_FREE(item); 3358 } 3359 } 3360 3361 #endif 3362 3363 //--------------------------------------------------------------------------- 3364 // Stuff for per-thread fast random number generator 3365 // Table of primes 3366 static const unsigned __kmp_primes[] = { 3367 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3368 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3369 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3370 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3371 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3372 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3373 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3374 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3375 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3376 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3377 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3378 3379 //--------------------------------------------------------------------------- 3380 // __kmp_get_random: Get a random number using a linear congruential method. 3381 unsigned short __kmp_get_random(kmp_info_t *thread) { 3382 unsigned x = thread->th.th_x; 3383 unsigned short r = x >> 16; 3384 3385 thread->th.th_x = x * thread->th.th_a + 1; 3386 3387 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3388 thread->th.th_info.ds.ds_tid, r)); 3389 3390 return r; 3391 } 3392 //-------------------------------------------------------- 3393 // __kmp_init_random: Initialize a random number generator 3394 void __kmp_init_random(kmp_info_t *thread) { 3395 unsigned seed = thread->th.th_info.ds.ds_tid; 3396 3397 thread->th.th_a = 3398 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3399 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3400 KA_TRACE(30, 3401 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3402 } 3403 3404 #if KMP_OS_WINDOWS 3405 /* reclaim array entries for root threads that are already dead, returns number 3406 * reclaimed */ 3407 static int __kmp_reclaim_dead_roots(void) { 3408 int i, r = 0; 3409 3410 for (i = 0; i < __kmp_threads_capacity; ++i) { 3411 if (KMP_UBER_GTID(i) && 3412 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3413 !__kmp_root[i] 3414 ->r.r_active) { // AC: reclaim only roots died in non-active state 3415 r += __kmp_unregister_root_other_thread(i); 3416 } 3417 } 3418 return r; 3419 } 3420 #endif 3421 3422 /* This function attempts to create free entries in __kmp_threads and 3423 __kmp_root, and returns the number of free entries generated. 3424 3425 For Windows* OS static library, the first mechanism used is to reclaim array 3426 entries for root threads that are already dead. 3427 3428 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3429 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3430 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3431 threadprivate cache array has been created. Synchronization with 3432 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
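   As a worked illustration of the doubling step (numbers are illustrative
   only): with __kmp_threads_capacity == 64 and nNeed == 100, the minimum
   required capacity is 164, so the capacity doubles 64 -> 128 -> 256
   (assuming __kmp_sys_max_nth is not the limiting factor) and 192 newly
   created free entries are counted toward the return value.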
3433 3434 After any dead root reclamation, if the clipping value allows array expansion 3435 to result in the generation of a total of nNeed free slots, the function does 3436 that expansion. If not, nothing is done beyond the possible initial root 3437 thread reclamation. 3438 3439 If any argument is negative, the behavior is undefined. */ 3440 static int __kmp_expand_threads(int nNeed) { 3441 int added = 0; 3442 int minimumRequiredCapacity; 3443 int newCapacity; 3444 kmp_info_t **newThreads; 3445 kmp_root_t **newRoot; 3446 3447 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3448 // resizing __kmp_threads does not need additional protection if foreign 3449 // threads are present 3450 3451 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3452 /* only for Windows static library */ 3453 /* reclaim array entries for root threads that are already dead */ 3454 added = __kmp_reclaim_dead_roots(); 3455 3456 if (nNeed) { 3457 nNeed -= added; 3458 if (nNeed < 0) 3459 nNeed = 0; 3460 } 3461 #endif 3462 if (nNeed <= 0) 3463 return added; 3464 3465 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3466 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3467 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3468 // > __kmp_max_nth in one of two ways: 3469 // 3470 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3471 // may not be resused by another thread, so we may need to increase 3472 // __kmp_threads_capacity to __kmp_max_nth + 1. 3473 // 3474 // 2) New foreign root(s) are encountered. We always register new foreign 3475 // roots. This may cause a smaller # of threads to be allocated at 3476 // subsequent parallel regions, but the worker threads hang around (and 3477 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3478 // 3479 // Anyway, that is the reason for moving the check to see if 3480 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3481 // instead of having it performed here. -BB 3482 3483 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3484 3485 /* compute expansion headroom to check if we can expand */ 3486 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3487 /* possible expansion too small -- give up */ 3488 return added; 3489 } 3490 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3491 3492 newCapacity = __kmp_threads_capacity; 3493 do { 3494 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3495 : __kmp_sys_max_nth; 3496 } while (newCapacity < minimumRequiredCapacity); 3497 newThreads = (kmp_info_t **)__kmp_allocate( 3498 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3499 newRoot = 3500 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3501 KMP_MEMCPY(newThreads, __kmp_threads, 3502 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3503 KMP_MEMCPY(newRoot, __kmp_root, 3504 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3505 3506 kmp_info_t **temp_threads = __kmp_threads; 3507 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3508 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3509 __kmp_free(temp_threads); 3510 added += newCapacity - __kmp_threads_capacity; 3511 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3512 3513 if (newCapacity > __kmp_tp_capacity) { 3514 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3515 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3516 __kmp_threadprivate_resize_cache(newCapacity); 3517 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3518 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3519 } 3520 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3521 } 3522 3523 return added; 3524 } 3525 3526 /* Register the current thread as a root thread and obtain our gtid. We must 3527 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3528 thread that calls from __kmp_do_serial_initialize() */ 3529 int __kmp_register_root(int initial_thread) { 3530 kmp_info_t *root_thread; 3531 kmp_root_t *root; 3532 int gtid; 3533 int capacity; 3534 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3535 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3536 KMP_MB(); 3537 3538 /* 2007-03-02: 3539 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3540 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3541 work as expected -- it may return false (that means there is at least one 3542 empty slot in __kmp_threads array), but it is possible the only free slot 3543 is #0, which is reserved for initial thread and so cannot be used for this 3544 one. Following code workarounds this bug. 3545 3546 However, right solution seems to be not reserving slot #0 for initial 3547 thread because: 3548 (1) there is no magic in slot #0, 3549 (2) we cannot detect initial thread reliably (the first thread which does 3550 serial initialization may be not a real initial thread). 3551 */ 3552 capacity = __kmp_threads_capacity; 3553 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3554 --capacity; 3555 } 3556 3557 /* see if there are too many threads */ 3558 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3559 if (__kmp_tp_cached) { 3560 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3561 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3562 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3563 } else { 3564 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3565 __kmp_msg_null); 3566 } 3567 } 3568 3569 /* find an available thread slot */ 3570 /* Don't reassign the zero slot since we need that to only be used by initial 3571 thread */ 3572 for (gtid = (initial_thread ? 
0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL; 3573 gtid++) 3574 ; 3575 KA_TRACE(1, 3576 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3577 KMP_ASSERT(gtid < __kmp_threads_capacity); 3578 3579 /* update global accounting */ 3580 __kmp_all_nth++; 3581 TCW_4(__kmp_nth, __kmp_nth + 1); 3582 3583 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3584 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3585 if (__kmp_adjust_gtid_mode) { 3586 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3587 if (TCR_4(__kmp_gtid_mode) != 2) { 3588 TCW_4(__kmp_gtid_mode, 2); 3589 } 3590 } else { 3591 if (TCR_4(__kmp_gtid_mode) != 1) { 3592 TCW_4(__kmp_gtid_mode, 1); 3593 } 3594 } 3595 } 3596 3597 #ifdef KMP_ADJUST_BLOCKTIME 3598 /* Adjust blocktime to zero if necessary */ 3599 /* Middle initialization might not have occurred yet */ 3600 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3601 if (__kmp_nth > __kmp_avail_proc) { 3602 __kmp_zero_bt = TRUE; 3603 } 3604 } 3605 #endif /* KMP_ADJUST_BLOCKTIME */ 3606 3607 /* setup this new hierarchy */ 3608 if (!(root = __kmp_root[gtid])) { 3609 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3610 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3611 } 3612 3613 #if KMP_STATS_ENABLED 3614 // Initialize stats as soon as possible (right after gtid assignment). 3615 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3616 __kmp_stats_thread_ptr->startLife(); 3617 KMP_SET_THREAD_STATE(SERIAL_REGION); 3618 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3619 #endif 3620 __kmp_initialize_root(root); 3621 3622 /* setup new root thread structure */ 3623 if (root->r.r_uber_thread) { 3624 root_thread = root->r.r_uber_thread; 3625 } else { 3626 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3627 if (__kmp_storage_map) { 3628 __kmp_print_thread_storage_map(root_thread, gtid); 3629 } 3630 root_thread->th.th_info.ds.ds_gtid = gtid; 3631 #if OMPT_SUPPORT 3632 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3633 #endif 3634 root_thread->th.th_root = root; 3635 if (__kmp_env_consistency_check) { 3636 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3637 } 3638 #if USE_FAST_MEMORY 3639 __kmp_initialize_fast_memory(root_thread); 3640 #endif /* USE_FAST_MEMORY */ 3641 3642 #if KMP_USE_BGET 3643 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3644 __kmp_initialize_bget(root_thread); 3645 #endif 3646 __kmp_init_random(root_thread); // Initialize random number generator 3647 } 3648 3649 /* setup the serial team held in reserve by the root thread */ 3650 if (!root_thread->th.th_serial_team) { 3651 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3652 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3653 root_thread->th.th_serial_team = __kmp_allocate_team( 3654 root, 1, 1, 3655 #if OMPT_SUPPORT 3656 ompt_data_none, // root parallel id 3657 #endif 3658 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3659 } 3660 KMP_ASSERT(root_thread->th.th_serial_team); 3661 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3662 root_thread->th.th_serial_team)); 3663 3664 /* drop root_thread into place */ 3665 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3666 3667 root->r.r_root_team->t.t_threads[0] = root_thread; 3668 root->r.r_hot_team->t.t_threads[0] = root_thread; 3669 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3670 // AC: the team created in reserve, not for execution (it is unused for now). 
3671 root_thread->th.th_serial_team->t.t_serialized = 0; 3672 root->r.r_uber_thread = root_thread; 3673 3674 /* initialize the thread, get it ready to go */ 3675 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3676 TCW_4(__kmp_init_gtid, TRUE); 3677 3678 /* prepare the master thread for get_gtid() */ 3679 __kmp_gtid_set_specific(gtid); 3680 3681 #if USE_ITT_BUILD 3682 __kmp_itt_thread_name(gtid); 3683 #endif /* USE_ITT_BUILD */ 3684 3685 #ifdef KMP_TDATA_GTID 3686 __kmp_gtid = gtid; 3687 #endif 3688 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3689 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3690 3691 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3692 "plain=%u\n", 3693 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3694 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3695 KMP_INIT_BARRIER_STATE)); 3696 { // Initialize barrier data. 3697 int b; 3698 for (b = 0; b < bs_last_barrier; ++b) { 3699 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3700 #if USE_DEBUGGER 3701 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3702 #endif 3703 } 3704 } 3705 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3706 KMP_INIT_BARRIER_STATE); 3707 3708 #if KMP_AFFINITY_SUPPORTED 3709 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3710 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3711 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3712 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3713 if (TCR_4(__kmp_init_middle)) { 3714 __kmp_affinity_set_init_mask(gtid, TRUE); 3715 } 3716 #endif /* KMP_AFFINITY_SUPPORTED */ 3717 root_thread->th.th_def_allocator = __kmp_def_allocator; 3718 root_thread->th.th_prev_level = 0; 3719 root_thread->th.th_prev_num_threads = 1; 3720 3721 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3722 tmp->cg_root = root_thread; 3723 tmp->cg_thread_limit = __kmp_cg_max_nth; 3724 tmp->cg_nthreads = 1; 3725 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3726 " cg_nthreads init to 1\n", 3727 root_thread, tmp)); 3728 tmp->up = NULL; 3729 root_thread->th.th_cg_roots = tmp; 3730 3731 __kmp_root_counter++; 3732 3733 #if OMPT_SUPPORT 3734 if (!initial_thread && ompt_enabled.enabled) { 3735 3736 kmp_info_t *root_thread = ompt_get_thread(); 3737 3738 ompt_set_thread_state(root_thread, ompt_state_overhead); 3739 3740 if (ompt_enabled.ompt_callback_thread_begin) { 3741 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3742 ompt_thread_initial, __ompt_get_thread_data_internal()); 3743 } 3744 ompt_data_t *task_data; 3745 ompt_data_t *parallel_data; 3746 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, NULL); 3747 if (ompt_enabled.ompt_callback_implicit_task) { 3748 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3749 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 3750 } 3751 3752 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3753 } 3754 #endif 3755 3756 KMP_MB(); 3757 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3758 3759 return gtid; 3760 } 3761 3762 #if KMP_NESTED_HOT_TEAMS 3763 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3764 const int max_level) { 3765 int i, n, nth; 3766 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3767 if (!hot_teams || !hot_teams[level].hot_team) { 3768 return 0; 3769 } 3770 KMP_DEBUG_ASSERT(level < max_level); 3771 kmp_team_t *team = 
hot_teams[level].hot_team; 3772 nth = hot_teams[level].hot_team_nth; 3773 n = nth - 1; // master is not freed 3774 if (level < max_level - 1) { 3775 for (i = 0; i < nth; ++i) { 3776 kmp_info_t *th = team->t.t_threads[i]; 3777 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3778 if (i > 0 && th->th.th_hot_teams) { 3779 __kmp_free(th->th.th_hot_teams); 3780 th->th.th_hot_teams = NULL; 3781 } 3782 } 3783 } 3784 __kmp_free_team(root, team, NULL); 3785 return n; 3786 } 3787 #endif 3788 3789 // Resets a root thread and clear its root and hot teams. 3790 // Returns the number of __kmp_threads entries directly and indirectly freed. 3791 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3792 kmp_team_t *root_team = root->r.r_root_team; 3793 kmp_team_t *hot_team = root->r.r_hot_team; 3794 int n = hot_team->t.t_nproc; 3795 int i; 3796 3797 KMP_DEBUG_ASSERT(!root->r.r_active); 3798 3799 root->r.r_root_team = NULL; 3800 root->r.r_hot_team = NULL; 3801 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3802 // before call to __kmp_free_team(). 3803 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3804 #if KMP_NESTED_HOT_TEAMS 3805 if (__kmp_hot_teams_max_level > 3806 0) { // need to free nested hot teams and their threads if any 3807 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3808 kmp_info_t *th = hot_team->t.t_threads[i]; 3809 if (__kmp_hot_teams_max_level > 1) { 3810 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3811 } 3812 if (th->th.th_hot_teams) { 3813 __kmp_free(th->th.th_hot_teams); 3814 th->th.th_hot_teams = NULL; 3815 } 3816 } 3817 } 3818 #endif 3819 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3820 3821 // Before we can reap the thread, we need to make certain that all other 3822 // threads in the teams that had this root as ancestor have stopped trying to 3823 // steal tasks. 3824 if (__kmp_tasking_mode != tskm_immediate_exec) { 3825 __kmp_wait_to_unref_task_teams(); 3826 } 3827 3828 #if KMP_OS_WINDOWS 3829 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3830 KA_TRACE( 3831 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3832 "\n", 3833 (LPVOID) & (root->r.r_uber_thread->th), 3834 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3835 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3836 #endif /* KMP_OS_WINDOWS */ 3837 3838 #if OMPT_SUPPORT 3839 ompt_data_t *task_data; 3840 ompt_data_t *parallel_data; 3841 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, NULL); 3842 if (ompt_enabled.ompt_callback_implicit_task) { 3843 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3844 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 3845 } 3846 if (ompt_enabled.ompt_callback_thread_end) { 3847 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3848 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3849 } 3850 #endif 3851 3852 TCW_4(__kmp_nth, 3853 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 
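  // Note: cg_nthreads is post-decremented below, so i holds the contention
  // group's member count before this root leaves it; i == 1 means the root
  // was the last member and its kmp_cg_root_t can be freed.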
3854 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 3855 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 3856 " to %d\n", 3857 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 3858 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 3859 if (i == 1) { 3860 // need to free contention group structure 3861 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 3862 root->r.r_uber_thread->th.th_cg_roots->cg_root); 3863 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 3864 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 3865 root->r.r_uber_thread->th.th_cg_roots = NULL; 3866 } 3867 __kmp_reap_thread(root->r.r_uber_thread, 1); 3868 3869 // We canot put root thread to __kmp_thread_pool, so we have to reap it istead 3870 // of freeing. 3871 root->r.r_uber_thread = NULL; 3872 /* mark root as no longer in use */ 3873 root->r.r_begin = FALSE; 3874 3875 return n; 3876 } 3877 3878 void __kmp_unregister_root_current_thread(int gtid) { 3879 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3880 /* this lock should be ok, since unregister_root_current_thread is never 3881 called during an abort, only during a normal close. furthermore, if you 3882 have the forkjoin lock, you should never try to get the initz lock */ 3883 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3884 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3885 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3886 "exiting T#%d\n", 3887 gtid)); 3888 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3889 return; 3890 } 3891 kmp_root_t *root = __kmp_root[gtid]; 3892 3893 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3894 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3895 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3896 KMP_ASSERT(root->r.r_active == FALSE); 3897 3898 KMP_MB(); 3899 3900 kmp_info_t *thread = __kmp_threads[gtid]; 3901 kmp_team_t *team = thread->th.th_team; 3902 kmp_task_team_t *task_team = thread->th.th_task_team; 3903 3904 // we need to wait for the proxy tasks before finishing the thread 3905 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 3906 #if OMPT_SUPPORT 3907 // the runtime is shutting down so we won't report any events 3908 thread->th.ompt_thread_info.state = ompt_state_undefined; 3909 #endif 3910 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 3911 } 3912 3913 __kmp_reset_root(gtid, root); 3914 3915 /* free up this thread slot */ 3916 __kmp_gtid_set_specific(KMP_GTID_DNE); 3917 #ifdef KMP_TDATA_GTID 3918 __kmp_gtid = KMP_GTID_DNE; 3919 #endif 3920 3921 KMP_MB(); 3922 KC_TRACE(10, 3923 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 3924 3925 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3926 } 3927 3928 #if KMP_OS_WINDOWS 3929 /* __kmp_forkjoin_lock must be already held 3930 Unregisters a root thread that is not the current thread. Returns the number 3931 of __kmp_threads entries freed as a result. 
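   See __kmp_reclaim_dead_roots(), which calls this from __kmp_expand_threads()
   in the Windows static-library build to recycle slots of roots whose threads
   have already exited.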
*/ 3932 static int __kmp_unregister_root_other_thread(int gtid) { 3933 kmp_root_t *root = __kmp_root[gtid]; 3934 int r; 3935 3936 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 3937 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3938 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3939 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3940 KMP_ASSERT(root->r.r_active == FALSE); 3941 3942 r = __kmp_reset_root(gtid, root); 3943 KC_TRACE(10, 3944 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 3945 return r; 3946 } 3947 #endif 3948 3949 #if KMP_DEBUG 3950 void __kmp_task_info() { 3951 3952 kmp_int32 gtid = __kmp_entry_gtid(); 3953 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 3954 kmp_info_t *this_thr = __kmp_threads[gtid]; 3955 kmp_team_t *steam = this_thr->th.th_serial_team; 3956 kmp_team_t *team = this_thr->th.th_team; 3957 3958 __kmp_printf( 3959 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 3960 "ptask=%p\n", 3961 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 3962 team->t.t_implicit_task_taskdata[tid].td_parent); 3963 } 3964 #endif // KMP_DEBUG 3965 3966 /* TODO optimize with one big memclr, take out what isn't needed, split 3967 responsibility to workers as much as possible, and delay initialization of 3968 features as much as possible */ 3969 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 3970 int tid, int gtid) { 3971 /* this_thr->th.th_info.ds.ds_gtid is setup in 3972 kmp_allocate_thread/create_worker. 3973 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 3974 kmp_info_t *master = team->t.t_threads[0]; 3975 KMP_DEBUG_ASSERT(this_thr != NULL); 3976 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 3977 KMP_DEBUG_ASSERT(team); 3978 KMP_DEBUG_ASSERT(team->t.t_threads); 3979 KMP_DEBUG_ASSERT(team->t.t_dispatch); 3980 KMP_DEBUG_ASSERT(master); 3981 KMP_DEBUG_ASSERT(master->th.th_root); 3982 3983 KMP_MB(); 3984 3985 TCW_SYNC_PTR(this_thr->th.th_team, team); 3986 3987 this_thr->th.th_info.ds.ds_tid = tid; 3988 this_thr->th.th_set_nproc = 0; 3989 if (__kmp_tasking_mode != tskm_immediate_exec) 3990 // When tasking is possible, threads are not safe to reap until they are 3991 // done tasking; this will be set when tasking code is exited in wait 3992 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 3993 else // no tasking --> always safe to reap 3994 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 3995 this_thr->th.th_set_proc_bind = proc_bind_default; 3996 #if KMP_AFFINITY_SUPPORTED 3997 this_thr->th.th_new_place = this_thr->th.th_current_place; 3998 #endif 3999 this_thr->th.th_root = master->th.th_root; 4000 4001 /* setup the thread's cache of the team structure */ 4002 this_thr->th.th_team_nproc = team->t.t_nproc; 4003 this_thr->th.th_team_master = master; 4004 this_thr->th.th_team_serialized = team->t.t_serialized; 4005 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4006 4007 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4008 4009 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4010 tid, gtid, this_thr, this_thr->th.th_current_task)); 4011 4012 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4013 team, tid, TRUE); 4014 4015 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4016 tid, gtid, this_thr, this_thr->th.th_current_task)); 4017 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4018 // __kmp_initialize_team()? 
4019 4020 /* TODO no worksharing in speculative threads */ 4021 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4022 4023 this_thr->th.th_local.this_construct = 0; 4024 4025 if (!this_thr->th.th_pri_common) { 4026 this_thr->th.th_pri_common = 4027 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4028 if (__kmp_storage_map) { 4029 __kmp_print_storage_map_gtid( 4030 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4031 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4032 } 4033 this_thr->th.th_pri_head = NULL; 4034 } 4035 4036 if (this_thr != master && // Master's CG root is initialized elsewhere 4037 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4038 // Make new thread's CG root same as master's 4039 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4040 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4041 if (tmp) { 4042 // worker changes CG, need to check if old CG should be freed 4043 int i = tmp->cg_nthreads--; 4044 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4045 " on node %p of thread %p to %d\n", 4046 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4047 if (i == 1) { 4048 __kmp_free(tmp); // last thread left CG --> free it 4049 } 4050 } 4051 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4052 // Increment new thread's CG root's counter to add the new thread 4053 this_thr->th.th_cg_roots->cg_nthreads++; 4054 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4055 " node %p of thread %p to %d\n", 4056 this_thr, this_thr->th.th_cg_roots, 4057 this_thr->th.th_cg_roots->cg_root, 4058 this_thr->th.th_cg_roots->cg_nthreads)); 4059 this_thr->th.th_current_task->td_icvs.thread_limit = 4060 this_thr->th.th_cg_roots->cg_thread_limit; 4061 } 4062 4063 /* Initialize dynamic dispatch */ 4064 { 4065 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4066 // Use team max_nproc since this will never change for the team. 4067 size_t disp_size = 4068 sizeof(dispatch_private_info_t) * 4069 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4070 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4071 team->t.t_max_nproc)); 4072 KMP_ASSERT(dispatch); 4073 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4074 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4075 4076 dispatch->th_disp_index = 0; 4077 dispatch->th_doacross_buf_idx = 0; 4078 if (!dispatch->th_disp_buffer) { 4079 dispatch->th_disp_buffer = 4080 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4081 4082 if (__kmp_storage_map) { 4083 __kmp_print_storage_map_gtid( 4084 gtid, &dispatch->th_disp_buffer[0], 4085 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4086 ? 
1 4087 : __kmp_dispatch_num_buffers], 4088 disp_size, "th_%d.th_dispatch.th_disp_buffer " 4089 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4090 gtid, team->t.t_id, gtid); 4091 } 4092 } else { 4093 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4094 } 4095 4096 dispatch->th_dispatch_pr_current = 0; 4097 dispatch->th_dispatch_sh_current = 0; 4098 4099 dispatch->th_deo_fcn = 0; /* ORDERED */ 4100 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4101 } 4102 4103 this_thr->th.th_next_pool = NULL; 4104 4105 if (!this_thr->th.th_task_state_memo_stack) { 4106 size_t i; 4107 this_thr->th.th_task_state_memo_stack = 4108 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4109 this_thr->th.th_task_state_top = 0; 4110 this_thr->th.th_task_state_stack_sz = 4; 4111 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4112 ++i) // zero init the stack 4113 this_thr->th.th_task_state_memo_stack[i] = 0; 4114 } 4115 4116 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4117 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4118 4119 KMP_MB(); 4120 } 4121 4122 /* allocate a new thread for the requesting team. this is only called from 4123 within a forkjoin critical section. we will first try to get an available 4124 thread from the thread pool. if none is available, we will fork a new one 4125 assuming we are able to create a new one. this should be assured, as the 4126 caller should check on this first. */ 4127 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4128 int new_tid) { 4129 kmp_team_t *serial_team; 4130 kmp_info_t *new_thr; 4131 int new_gtid; 4132 4133 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4134 KMP_DEBUG_ASSERT(root && team); 4135 #if !KMP_NESTED_HOT_TEAMS 4136 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4137 #endif 4138 KMP_MB(); 4139 4140 /* first, try to get one from the thread pool */ 4141 if (__kmp_thread_pool) { 4142 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4143 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4144 if (new_thr == __kmp_thread_pool_insert_pt) { 4145 __kmp_thread_pool_insert_pt = NULL; 4146 } 4147 TCW_4(new_thr->th.th_in_pool, FALSE); 4148 __kmp_suspend_initialize_thread(new_thr); 4149 __kmp_lock_suspend_mx(new_thr); 4150 if (new_thr->th.th_active_in_pool == TRUE) { 4151 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4152 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4153 new_thr->th.th_active_in_pool = FALSE; 4154 } 4155 __kmp_unlock_suspend_mx(new_thr); 4156 4157 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4158 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4159 KMP_ASSERT(!new_thr->th.th_team); 4160 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4161 4162 /* setup the thread structure */ 4163 __kmp_initialize_info(new_thr, team, new_tid, 4164 new_thr->th.th_info.ds.ds_gtid); 4165 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4166 4167 TCW_4(__kmp_nth, __kmp_nth + 1); 4168 4169 new_thr->th.th_task_state = 0; 4170 new_thr->th.th_task_state_top = 0; 4171 new_thr->th.th_task_state_stack_sz = 4; 4172 4173 #ifdef KMP_ADJUST_BLOCKTIME 4174 /* Adjust blocktime back to zero if necessary */ 4175 /* Middle initialization might not have occurred yet */ 4176 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4177 if (__kmp_nth > __kmp_avail_proc) { 4178 __kmp_zero_bt = TRUE; 4179 } 4180 } 4181 #endif /* KMP_ADJUST_BLOCKTIME */ 4182 4183 #if KMP_DEBUG 4184 // If thread entered pool via __kmp_free_thread, wait_flag should != 4185 // KMP_BARRIER_PARENT_FLAG. 
4186 int b; 4187 kmp_balign_t *balign = new_thr->th.th_bar; 4188 for (b = 0; b < bs_last_barrier; ++b) 4189 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4190 #endif 4191 4192 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4193 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4194 4195 KMP_MB(); 4196 return new_thr; 4197 } 4198 4199 /* no, well fork a new one */ 4200 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4201 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4202 4203 #if KMP_USE_MONITOR 4204 // If this is the first worker thread the RTL is creating, then also 4205 // launch the monitor thread. We try to do this as early as possible. 4206 if (!TCR_4(__kmp_init_monitor)) { 4207 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4208 if (!TCR_4(__kmp_init_monitor)) { 4209 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4210 TCW_4(__kmp_init_monitor, 1); 4211 __kmp_create_monitor(&__kmp_monitor); 4212 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4213 #if KMP_OS_WINDOWS 4214 // AC: wait until monitor has started. This is a fix for CQ232808. 4215 // The reason is that if the library is loaded/unloaded in a loop with 4216 // small (parallel) work in between, then there is high probability that 4217 // monitor thread started after the library shutdown. At shutdown it is 4218 // too late to cope with the problem, because when the master is in 4219 // DllMain (process detach) the monitor has no chances to start (it is 4220 // blocked), and master has no means to inform the monitor that the 4221 // library has gone, because all the memory which the monitor can access 4222 // is going to be released/reset. 4223 while (TCR_4(__kmp_init_monitor) < 2) { 4224 KMP_YIELD(TRUE); 4225 } 4226 KF_TRACE(10, ("after monitor thread has started\n")); 4227 #endif 4228 } 4229 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4230 } 4231 #endif 4232 4233 KMP_MB(); 4234 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) { 4235 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4236 } 4237 4238 /* allocate space for it. */ 4239 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4240 4241 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4242 4243 if (__kmp_storage_map) { 4244 __kmp_print_thread_storage_map(new_thr, new_gtid); 4245 } 4246 4247 // add the reserve serialized team, initialized from the team's master thread 4248 { 4249 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4250 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4251 new_thr->th.th_serial_team = serial_team = 4252 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4253 #if OMPT_SUPPORT 4254 ompt_data_none, // root parallel id 4255 #endif 4256 proc_bind_default, &r_icvs, 4257 0 USE_NESTED_HOT_ARG(NULL)); 4258 } 4259 KMP_ASSERT(serial_team); 4260 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4261 // execution (it is unused for now). 
4262 serial_team->t.t_threads[0] = new_thr; 4263 KF_TRACE(10, 4264 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4265 new_thr)); 4266 4267 /* setup the thread structures */ 4268 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4269 4270 #if USE_FAST_MEMORY 4271 __kmp_initialize_fast_memory(new_thr); 4272 #endif /* USE_FAST_MEMORY */ 4273 4274 #if KMP_USE_BGET 4275 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4276 __kmp_initialize_bget(new_thr); 4277 #endif 4278 4279 __kmp_init_random(new_thr); // Initialize random number generator 4280 4281 /* Initialize these only once when thread is grabbed for a team allocation */ 4282 KA_TRACE(20, 4283 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4284 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4285 4286 int b; 4287 kmp_balign_t *balign = new_thr->th.th_bar; 4288 for (b = 0; b < bs_last_barrier; ++b) { 4289 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4290 balign[b].bb.team = NULL; 4291 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4292 balign[b].bb.use_oncore_barrier = 0; 4293 } 4294 4295 new_thr->th.th_spin_here = FALSE; 4296 new_thr->th.th_next_waiting = 0; 4297 #if KMP_OS_UNIX 4298 new_thr->th.th_blocking = false; 4299 #endif 4300 4301 #if KMP_AFFINITY_SUPPORTED 4302 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4303 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4304 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4305 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4306 #endif 4307 new_thr->th.th_def_allocator = __kmp_def_allocator; 4308 new_thr->th.th_prev_level = 0; 4309 new_thr->th.th_prev_num_threads = 1; 4310 4311 TCW_4(new_thr->th.th_in_pool, FALSE); 4312 new_thr->th.th_active_in_pool = FALSE; 4313 TCW_4(new_thr->th.th_active, TRUE); 4314 4315 /* adjust the global counters */ 4316 __kmp_all_nth++; 4317 __kmp_nth++; 4318 4319 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4320 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4321 if (__kmp_adjust_gtid_mode) { 4322 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4323 if (TCR_4(__kmp_gtid_mode) != 2) { 4324 TCW_4(__kmp_gtid_mode, 2); 4325 } 4326 } else { 4327 if (TCR_4(__kmp_gtid_mode) != 1) { 4328 TCW_4(__kmp_gtid_mode, 1); 4329 } 4330 } 4331 } 4332 4333 #ifdef KMP_ADJUST_BLOCKTIME 4334 /* Adjust blocktime back to zero if necessary */ 4335 /* Middle initialization might not have occurred yet */ 4336 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4337 if (__kmp_nth > __kmp_avail_proc) { 4338 __kmp_zero_bt = TRUE; 4339 } 4340 } 4341 #endif /* KMP_ADJUST_BLOCKTIME */ 4342 4343 /* actually fork it and create the new worker thread */ 4344 KF_TRACE( 4345 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4346 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4347 KF_TRACE(10, 4348 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4349 4350 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4351 new_gtid)); 4352 KMP_MB(); 4353 return new_thr; 4354 } 4355 4356 /* Reinitialize team for reuse. 4357 The hot team code calls this case at every fork barrier, so EPCC barrier 4358 test are extremely sensitive to changes in it, esp. writes to the team 4359 struct, which cause a cache invalidation in all threads. 4360 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
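   That is also why the updates below go through KMP_CHECK_UPDATE, which is
   intended to store only when the new value differs from the current one, so
   unchanged fields do not dirty cache lines shared with the workers.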
*/ 4361 static void __kmp_reinitialize_team(kmp_team_t *team, 4362 kmp_internal_control_t *new_icvs, 4363 ident_t *loc) { 4364 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4365 team->t.t_threads[0], team)); 4366 KMP_DEBUG_ASSERT(team && new_icvs); 4367 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4368 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4369 4370 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4371 // Copy ICVs to the master thread's implicit taskdata 4372 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4373 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4374 4375 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4376 team->t.t_threads[0], team)); 4377 } 4378 4379 /* Initialize the team data structure. 4380 This assumes the t_threads and t_max_nproc are already set. 4381 Also, we don't touch the arguments */ 4382 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4383 kmp_internal_control_t *new_icvs, 4384 ident_t *loc) { 4385 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4386 4387 /* verify */ 4388 KMP_DEBUG_ASSERT(team); 4389 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4390 KMP_DEBUG_ASSERT(team->t.t_threads); 4391 KMP_MB(); 4392 4393 team->t.t_master_tid = 0; /* not needed */ 4394 /* team->t.t_master_bar; not needed */ 4395 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4396 team->t.t_nproc = new_nproc; 4397 4398 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4399 team->t.t_next_pool = NULL; 4400 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4401 * up hot team */ 4402 4403 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4404 team->t.t_invoke = NULL; /* not needed */ 4405 4406 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4407 team->t.t_sched.sched = new_icvs->sched.sched; 4408 4409 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4410 team->t.t_fp_control_saved = FALSE; /* not needed */ 4411 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4412 team->t.t_mxcsr = 0; /* not needed */ 4413 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4414 4415 team->t.t_construct = 0; 4416 4417 team->t.t_ordered.dt.t_value = 0; 4418 team->t.t_master_active = FALSE; 4419 4420 #ifdef KMP_DEBUG 4421 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4422 #endif 4423 #if KMP_OS_WINDOWS 4424 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4425 #endif 4426 4427 team->t.t_control_stack_top = NULL; 4428 4429 __kmp_reinitialize_team(team, new_icvs, loc); 4430 4431 KMP_MB(); 4432 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4433 } 4434 4435 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 4436 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4437 static void 4438 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4439 if (KMP_AFFINITY_CAPABLE()) { 4440 int status; 4441 if (old_mask != NULL) { 4442 status = __kmp_get_system_affinity(old_mask, TRUE); 4443 int error = errno; 4444 if (status != 0) { 4445 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4446 __kmp_msg_null); 4447 } 4448 } 4449 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4450 } 4451 } 4452 #endif 4453 4454 #if KMP_AFFINITY_SUPPORTED 4455 4456 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4457 // It calculats the worker + master thread's partition based upon the parent 4458 // thread's partition, and binds each worker to a thread in their partition. 4459 // The master thread's partition should already include its current binding. 4460 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4461 // Copy the master thread's place partion to the team struct 4462 kmp_info_t *master_th = team->t.t_threads[0]; 4463 KMP_DEBUG_ASSERT(master_th != NULL); 4464 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4465 int first_place = master_th->th.th_first_place; 4466 int last_place = master_th->th.th_last_place; 4467 int masters_place = master_th->th.th_current_place; 4468 team->t.t_first_place = first_place; 4469 team->t.t_last_place = last_place; 4470 4471 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4472 "bound to place %d partition = [%d,%d]\n", 4473 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4474 team->t.t_id, masters_place, first_place, last_place)); 4475 4476 switch (proc_bind) { 4477 4478 case proc_bind_default: 4479 // serial teams might have the proc_bind policy set to proc_bind_default. It 4480 // doesn't matter, as we don't rebind master thread for any proc_bind policy 4481 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4482 break; 4483 4484 case proc_bind_master: { 4485 int f; 4486 int n_th = team->t.t_nproc; 4487 for (f = 1; f < n_th; f++) { 4488 kmp_info_t *th = team->t.t_threads[f]; 4489 KMP_DEBUG_ASSERT(th != NULL); 4490 th->th.th_first_place = first_place; 4491 th->th.th_last_place = last_place; 4492 th->th.th_new_place = masters_place; 4493 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4494 team->t.t_display_affinity != 1) { 4495 team->t.t_display_affinity = 1; 4496 } 4497 4498 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " 4499 "partition = [%d,%d]\n", 4500 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4501 f, masters_place, first_place, last_place)); 4502 } 4503 } break; 4504 4505 case proc_bind_close: { 4506 int f; 4507 int n_th = team->t.t_nproc; 4508 int n_places; 4509 if (first_place <= last_place) { 4510 n_places = last_place - first_place + 1; 4511 } else { 4512 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4513 } 4514 if (n_th <= n_places) { 4515 int place = masters_place; 4516 for (f = 1; f < n_th; f++) { 4517 kmp_info_t *th = team->t.t_threads[f]; 4518 KMP_DEBUG_ASSERT(th != NULL); 4519 4520 if (place == last_place) { 4521 place = first_place; 4522 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4523 place = 0; 4524 } else { 4525 place++; 4526 } 4527 th->th.th_first_place = first_place; 4528 th->th.th_last_place = last_place; 4529 th->th.th_new_place = place; 4530 if (__kmp_display_affinity && place != th->th.th_current_place && 4531 team->t.t_display_affinity != 1) { 4532 team->t.t_display_affinity = 1; 4533 } 4534 4535 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4536 "partition = [%d,%d]\n", 4537 __kmp_gtid_from_thread(team->t.t_threads[f]), 4538 team->t.t_id, f, place, first_place, last_place)); 4539 } 4540 } else { 4541 int S, rem, gap, s_count; 4542 S = n_th / n_places; 4543 s_count = 0; 4544 rem = n_th - (S * n_places); 4545 gap = rem > 0 ? 
n_places / rem : n_places; 4546 int place = masters_place; 4547 int gap_ct = gap; 4548 for (f = 0; f < n_th; f++) { 4549 kmp_info_t *th = team->t.t_threads[f]; 4550 KMP_DEBUG_ASSERT(th != NULL); 4551 4552 th->th.th_first_place = first_place; 4553 th->th.th_last_place = last_place; 4554 th->th.th_new_place = place; 4555 if (__kmp_display_affinity && place != th->th.th_current_place && 4556 team->t.t_display_affinity != 1) { 4557 team->t.t_display_affinity = 1; 4558 } 4559 s_count++; 4560 4561 if ((s_count == S) && rem && (gap_ct == gap)) { 4562 // do nothing, add an extra thread to place on next iteration 4563 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4564 // we added an extra thread to this place; move to next place 4565 if (place == last_place) { 4566 place = first_place; 4567 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4568 place = 0; 4569 } else { 4570 place++; 4571 } 4572 s_count = 0; 4573 gap_ct = 1; 4574 rem--; 4575 } else if (s_count == S) { // place full; don't add extra 4576 if (place == last_place) { 4577 place = first_place; 4578 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4579 place = 0; 4580 } else { 4581 place++; 4582 } 4583 gap_ct++; 4584 s_count = 0; 4585 } 4586 4587 KA_TRACE(100, 4588 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4589 "partition = [%d,%d]\n", 4590 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4591 th->th.th_new_place, first_place, last_place)); 4592 } 4593 KMP_DEBUG_ASSERT(place == masters_place); 4594 } 4595 } break; 4596 4597 case proc_bind_spread: { 4598 int f; 4599 int n_th = team->t.t_nproc; 4600 int n_places; 4601 int thidx; 4602 if (first_place <= last_place) { 4603 n_places = last_place - first_place + 1; 4604 } else { 4605 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4606 } 4607 if (n_th <= n_places) { 4608 int place = -1; 4609 4610 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4611 int S = n_places / n_th; 4612 int s_count, rem, gap, gap_ct; 4613 4614 place = masters_place; 4615 rem = n_places - n_th * S; 4616 gap = rem ? 
n_th / rem : 1; 4617 gap_ct = gap; 4618 thidx = n_th; 4619 if (update_master_only == 1) 4620 thidx = 1; 4621 for (f = 0; f < thidx; f++) { 4622 kmp_info_t *th = team->t.t_threads[f]; 4623 KMP_DEBUG_ASSERT(th != NULL); 4624 4625 th->th.th_first_place = place; 4626 th->th.th_new_place = place; 4627 if (__kmp_display_affinity && place != th->th.th_current_place && 4628 team->t.t_display_affinity != 1) { 4629 team->t.t_display_affinity = 1; 4630 } 4631 s_count = 1; 4632 while (s_count < S) { 4633 if (place == last_place) { 4634 place = first_place; 4635 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4636 place = 0; 4637 } else { 4638 place++; 4639 } 4640 s_count++; 4641 } 4642 if (rem && (gap_ct == gap)) { 4643 if (place == last_place) { 4644 place = first_place; 4645 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4646 place = 0; 4647 } else { 4648 place++; 4649 } 4650 rem--; 4651 gap_ct = 0; 4652 } 4653 th->th.th_last_place = place; 4654 gap_ct++; 4655 4656 if (place == last_place) { 4657 place = first_place; 4658 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4659 place = 0; 4660 } else { 4661 place++; 4662 } 4663 4664 KA_TRACE(100, 4665 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4666 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4667 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4668 f, th->th.th_new_place, th->th.th_first_place, 4669 th->th.th_last_place, __kmp_affinity_num_masks)); 4670 } 4671 } else { 4672 /* Having uniform space of available computation places I can create 4673 T partitions of round(P/T) size and put threads into the first 4674 place of each partition. */ 4675 double current = static_cast<double>(masters_place); 4676 double spacing = 4677 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4678 int first, last; 4679 kmp_info_t *th; 4680 4681 thidx = n_th + 1; 4682 if (update_master_only == 1) 4683 thidx = 1; 4684 for (f = 0; f < thidx; f++) { 4685 first = static_cast<int>(current); 4686 last = static_cast<int>(current + spacing) - 1; 4687 KMP_DEBUG_ASSERT(last >= first); 4688 if (first >= n_places) { 4689 if (masters_place) { 4690 first -= n_places; 4691 last -= n_places; 4692 if (first == (masters_place + 1)) { 4693 KMP_DEBUG_ASSERT(f == n_th); 4694 first--; 4695 } 4696 if (last == masters_place) { 4697 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4698 last--; 4699 } 4700 } else { 4701 KMP_DEBUG_ASSERT(f == n_th); 4702 first = 0; 4703 last = 0; 4704 } 4705 } 4706 if (last >= n_places) { 4707 last = (n_places - 1); 4708 } 4709 place = first; 4710 current += spacing; 4711 if (f < n_th) { 4712 KMP_DEBUG_ASSERT(0 <= first); 4713 KMP_DEBUG_ASSERT(n_places > first); 4714 KMP_DEBUG_ASSERT(0 <= last); 4715 KMP_DEBUG_ASSERT(n_places > last); 4716 KMP_DEBUG_ASSERT(last_place >= first_place); 4717 th = team->t.t_threads[f]; 4718 KMP_DEBUG_ASSERT(th); 4719 th->th.th_first_place = first; 4720 th->th.th_new_place = place; 4721 th->th.th_last_place = last; 4722 if (__kmp_display_affinity && place != th->th.th_current_place && 4723 team->t.t_display_affinity != 1) { 4724 team->t.t_display_affinity = 1; 4725 } 4726 KA_TRACE(100, 4727 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4728 "partition = [%d,%d], spacing = %.4f\n", 4729 __kmp_gtid_from_thread(team->t.t_threads[f]), 4730 team->t.t_id, f, th->th.th_new_place, 4731 th->th.th_first_place, th->th.th_last_place, spacing)); 4732 } 4733 } 4734 } 4735 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4736 } else { 4737 int S, rem, gap, 
s_count; 4738 S = n_th / n_places; 4739 s_count = 0; 4740 rem = n_th - (S * n_places); 4741 gap = rem > 0 ? n_places / rem : n_places; 4742 int place = masters_place; 4743 int gap_ct = gap; 4744 thidx = n_th; 4745 if (update_master_only == 1) 4746 thidx = 1; 4747 for (f = 0; f < thidx; f++) { 4748 kmp_info_t *th = team->t.t_threads[f]; 4749 KMP_DEBUG_ASSERT(th != NULL); 4750 4751 th->th.th_first_place = place; 4752 th->th.th_last_place = place; 4753 th->th.th_new_place = place; 4754 if (__kmp_display_affinity && place != th->th.th_current_place && 4755 team->t.t_display_affinity != 1) { 4756 team->t.t_display_affinity = 1; 4757 } 4758 s_count++; 4759 4760 if ((s_count == S) && rem && (gap_ct == gap)) { 4761 // do nothing, add an extra thread to place on next iteration 4762 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4763 // we added an extra thread to this place; move on to next place 4764 if (place == last_place) { 4765 place = first_place; 4766 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4767 place = 0; 4768 } else { 4769 place++; 4770 } 4771 s_count = 0; 4772 gap_ct = 1; 4773 rem--; 4774 } else if (s_count == S) { // place is full; don't add extra thread 4775 if (place == last_place) { 4776 place = first_place; 4777 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4778 place = 0; 4779 } else { 4780 place++; 4781 } 4782 gap_ct++; 4783 s_count = 0; 4784 } 4785 4786 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4787 "partition = [%d,%d]\n", 4788 __kmp_gtid_from_thread(team->t.t_threads[f]), 4789 team->t.t_id, f, th->th.th_new_place, 4790 th->th.th_first_place, th->th.th_last_place)); 4791 } 4792 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4793 } 4794 } break; 4795 4796 default: 4797 break; 4798 } 4799 4800 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4801 } 4802 4803 #endif // KMP_AFFINITY_SUPPORTED 4804 4805 /* allocate a new team data structure to use. take one off of the free pool if 4806 available */ 4807 kmp_team_t * 4808 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4809 #if OMPT_SUPPORT 4810 ompt_data_t ompt_parallel_data, 4811 #endif 4812 kmp_proc_bind_t new_proc_bind, 4813 kmp_internal_control_t *new_icvs, 4814 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4815 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4816 int f; 4817 kmp_team_t *team; 4818 int use_hot_team = !root->r.r_active; 4819 int level = 0; 4820 4821 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4822 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4823 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4824 KMP_MB(); 4825 4826 #if KMP_NESTED_HOT_TEAMS 4827 kmp_hot_team_ptr_t *hot_teams; 4828 if (master) { 4829 team = master->th.th_team; 4830 level = team->t.t_active_level; 4831 if (master->th.th_teams_microtask) { // in teams construct? 
4832 if (master->th.th_teams_size.nteams > 1 && 4833 ( // #teams > 1 4834 team->t.t_pkfn == 4835 (microtask_t)__kmp_teams_master || // inner fork of the teams 4836 master->th.th_teams_level < 4837 team->t.t_level)) { // or nested parallel inside the teams 4838 ++level; // not increment if #teams==1, or for outer fork of the teams; 4839 // increment otherwise 4840 } 4841 } 4842 hot_teams = master->th.th_hot_teams; 4843 if (level < __kmp_hot_teams_max_level && hot_teams && 4844 hot_teams[level] 4845 .hot_team) { // hot team has already been allocated for given level 4846 use_hot_team = 1; 4847 } else { 4848 use_hot_team = 0; 4849 } 4850 } 4851 #endif 4852 // Optimization to use a "hot" team 4853 if (use_hot_team && new_nproc > 1) { 4854 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 4855 #if KMP_NESTED_HOT_TEAMS 4856 team = hot_teams[level].hot_team; 4857 #else 4858 team = root->r.r_hot_team; 4859 #endif 4860 #if KMP_DEBUG 4861 if (__kmp_tasking_mode != tskm_immediate_exec) { 4862 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4863 "task_team[1] = %p before reinit\n", 4864 team->t.t_task_team[0], team->t.t_task_team[1])); 4865 } 4866 #endif 4867 4868 // Has the number of threads changed? 4869 /* Let's assume the most common case is that the number of threads is 4870 unchanged, and put that case first. */ 4871 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4872 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4873 // This case can mean that omp_set_num_threads() was called and the hot 4874 // team size was already reduced, so we check the special flag 4875 if (team->t.t_size_changed == -1) { 4876 team->t.t_size_changed = 1; 4877 } else { 4878 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4879 } 4880 4881 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4882 kmp_r_sched_t new_sched = new_icvs->sched; 4883 // set master's schedule as new run-time schedule 4884 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 4885 4886 __kmp_reinitialize_team(team, new_icvs, 4887 root->r.r_uber_thread->th.th_ident); 4888 4889 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 4890 team->t.t_threads[0], team)); 4891 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 4892 4893 #if KMP_AFFINITY_SUPPORTED 4894 if ((team->t.t_size_changed == 0) && 4895 (team->t.t_proc_bind == new_proc_bind)) { 4896 if (new_proc_bind == proc_bind_spread) { 4897 __kmp_partition_places( 4898 team, 1); // add flag to update only master for spread 4899 } 4900 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 4901 "proc_bind = %d, partition = [%d,%d]\n", 4902 team->t.t_id, new_proc_bind, team->t.t_first_place, 4903 team->t.t_last_place)); 4904 } else { 4905 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4906 __kmp_partition_places(team); 4907 } 4908 #else 4909 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4910 #endif /* KMP_AFFINITY_SUPPORTED */ 4911 } else if (team->t.t_nproc > new_nproc) { 4912 KA_TRACE(20, 4913 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 4914 new_nproc)); 4915 4916 team->t.t_size_changed = 1; 4917 #if KMP_NESTED_HOT_TEAMS 4918 if (__kmp_hot_teams_mode == 0) { 4919 // AC: saved number of threads should correspond to team's value in this 4920 // mode, can be bigger in mode 1, when hot team has threads in reserve 4921 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 4922 hot_teams[level].hot_team_nth = new_nproc; 4923 #endif // 
KMP_NESTED_HOT_TEAMS 4924 /* release the extra threads we don't need any more */ 4925 for (f = new_nproc; f < team->t.t_nproc; f++) { 4926 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 4927 if (__kmp_tasking_mode != tskm_immediate_exec) { 4928 // When decreasing team size, threads no longer in the team should 4929 // unref task team. 4930 team->t.t_threads[f]->th.th_task_team = NULL; 4931 } 4932 __kmp_free_thread(team->t.t_threads[f]); 4933 team->t.t_threads[f] = NULL; 4934 } 4935 #if KMP_NESTED_HOT_TEAMS 4936 } // (__kmp_hot_teams_mode == 0) 4937 else { 4938 // When keeping extra threads in team, switch threads to wait on own 4939 // b_go flag 4940 for (f = new_nproc; f < team->t.t_nproc; ++f) { 4941 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 4942 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 4943 for (int b = 0; b < bs_last_barrier; ++b) { 4944 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 4945 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 4946 } 4947 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 4948 } 4949 } 4950 } 4951 #endif // KMP_NESTED_HOT_TEAMS 4952 team->t.t_nproc = new_nproc; 4953 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4954 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 4955 __kmp_reinitialize_team(team, new_icvs, 4956 root->r.r_uber_thread->th.th_ident); 4957 4958 // Update remaining threads 4959 for (f = 0; f < new_nproc; ++f) { 4960 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 4961 } 4962 4963 // restore the current task state of the master thread: should be the 4964 // implicit task 4965 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 4966 team->t.t_threads[0], team)); 4967 4968 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 4969 4970 #ifdef KMP_DEBUG 4971 for (f = 0; f < team->t.t_nproc; f++) { 4972 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 4973 team->t.t_threads[f]->th.th_team_nproc == 4974 team->t.t_nproc); 4975 } 4976 #endif 4977 4978 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4979 #if KMP_AFFINITY_SUPPORTED 4980 __kmp_partition_places(team); 4981 #endif 4982 } else { // team->t.t_nproc < new_nproc 4983 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 4984 kmp_affin_mask_t *old_mask; 4985 if (KMP_AFFINITY_CAPABLE()) { 4986 KMP_CPU_ALLOC(old_mask); 4987 } 4988 #endif 4989 4990 KA_TRACE(20, 4991 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 4992 new_nproc)); 4993 4994 team->t.t_size_changed = 1; 4995 4996 #if KMP_NESTED_HOT_TEAMS 4997 int avail_threads = hot_teams[level].hot_team_nth; 4998 if (new_nproc < avail_threads) 4999 avail_threads = new_nproc; 5000 kmp_info_t **other_threads = team->t.t_threads; 5001 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5002 // Adjust barrier data of reserved threads (if any) of the team 5003 // Other data will be set in __kmp_initialize_info() below. 
5004 int b; 5005 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5006 for (b = 0; b < bs_last_barrier; ++b) { 5007 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5008 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5009 #if USE_DEBUGGER 5010 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5011 #endif 5012 } 5013 } 5014 if (hot_teams[level].hot_team_nth >= new_nproc) { 5015 // we have all needed threads in reserve, no need to allocate any 5016 // this only possible in mode 1, cannot have reserved threads in mode 0 5017 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5018 team->t.t_nproc = new_nproc; // just get reserved threads involved 5019 } else { 5020 // we may have some threads in reserve, but not enough 5021 team->t.t_nproc = 5022 hot_teams[level] 5023 .hot_team_nth; // get reserved threads involved if any 5024 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5025 #endif // KMP_NESTED_HOT_TEAMS 5026 if (team->t.t_max_nproc < new_nproc) { 5027 /* reallocate larger arrays */ 5028 __kmp_reallocate_team_arrays(team, new_nproc); 5029 __kmp_reinitialize_team(team, new_icvs, NULL); 5030 } 5031 5032 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5033 /* Temporarily set full mask for master thread before creation of 5034 workers. The reason is that workers inherit the affinity from master, 5035 so if a lot of workers are created on the single core quickly, they 5036 don't get a chance to set their own affinity for a long time. */ 5037 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5038 #endif 5039 5040 /* allocate new threads for the hot team */ 5041 for (f = team->t.t_nproc; f < new_nproc; f++) { 5042 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5043 KMP_DEBUG_ASSERT(new_worker); 5044 team->t.t_threads[f] = new_worker; 5045 5046 KA_TRACE(20, 5047 ("__kmp_allocate_team: team %d init T#%d arrived: " 5048 "join=%llu, plain=%llu\n", 5049 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5050 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5051 team->t.t_bar[bs_plain_barrier].b_arrived)); 5052 5053 { // Initialize barrier data for new threads. 5054 int b; 5055 kmp_balign_t *balign = new_worker->th.th_bar; 5056 for (b = 0; b < bs_last_barrier; ++b) { 5057 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5058 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5059 KMP_BARRIER_PARENT_FLAG); 5060 #if USE_DEBUGGER 5061 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5062 #endif 5063 } 5064 } 5065 } 5066 5067 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5068 if (KMP_AFFINITY_CAPABLE()) { 5069 /* Restore initial master thread's affinity mask */ 5070 __kmp_set_system_affinity(old_mask, TRUE); 5071 KMP_CPU_FREE(old_mask); 5072 } 5073 #endif 5074 #if KMP_NESTED_HOT_TEAMS 5075 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5076 #endif // KMP_NESTED_HOT_TEAMS 5077 /* make sure everyone is syncronized */ 5078 int old_nproc = team->t.t_nproc; // save old value and use to update only 5079 // new threads below 5080 __kmp_initialize_team(team, new_nproc, new_icvs, 5081 root->r.r_uber_thread->th.th_ident); 5082 5083 /* reinitialize the threads */ 5084 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5085 for (f = 0; f < team->t.t_nproc; ++f) 5086 __kmp_initialize_info(team->t.t_threads[f], team, f, 5087 __kmp_gtid_from_tid(f, team)); 5088 5089 if (level) { // set th_task_state for new threads in nested hot team 5090 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5091 // only need to set the th_task_state for the new threads. th_task_state 5092 // for master thread will not be accurate until after this in 5093 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5094 // correct value. 5095 for (f = old_nproc; f < team->t.t_nproc; ++f) 5096 team->t.t_threads[f]->th.th_task_state = 5097 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5098 } else { // set th_task_state for new threads in non-nested hot team 5099 int old_state = 5100 team->t.t_threads[0]->th.th_task_state; // copy master's state 5101 for (f = old_nproc; f < team->t.t_nproc; ++f) 5102 team->t.t_threads[f]->th.th_task_state = old_state; 5103 } 5104 5105 #ifdef KMP_DEBUG 5106 for (f = 0; f < team->t.t_nproc; ++f) { 5107 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5108 team->t.t_threads[f]->th.th_team_nproc == 5109 team->t.t_nproc); 5110 } 5111 #endif 5112 5113 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5114 #if KMP_AFFINITY_SUPPORTED 5115 __kmp_partition_places(team); 5116 #endif 5117 } // Check changes in number of threads 5118 5119 kmp_info_t *master = team->t.t_threads[0]; 5120 if (master->th.th_teams_microtask) { 5121 for (f = 1; f < new_nproc; ++f) { 5122 // propagate teams construct specific info to workers 5123 kmp_info_t *thr = team->t.t_threads[f]; 5124 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5125 thr->th.th_teams_level = master->th.th_teams_level; 5126 thr->th.th_teams_size = master->th.th_teams_size; 5127 } 5128 } 5129 #if KMP_NESTED_HOT_TEAMS 5130 if (level) { 5131 // Sync barrier state for nested hot teams, not needed for outermost hot 5132 // team. 5133 for (f = 1; f < new_nproc; ++f) { 5134 kmp_info_t *thr = team->t.t_threads[f]; 5135 int b; 5136 kmp_balign_t *balign = thr->th.th_bar; 5137 for (b = 0; b < bs_last_barrier; ++b) { 5138 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5139 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5140 #if USE_DEBUGGER 5141 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5142 #endif 5143 } 5144 } 5145 } 5146 #endif // KMP_NESTED_HOT_TEAMS 5147 5148 /* reallocate space for arguments if necessary */ 5149 __kmp_alloc_argv_entries(argc, team, TRUE); 5150 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5151 // The hot team re-uses the previous task team, 5152 // if untouched during the previous release->gather phase. 
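// [Editorial aside -- illustrative only, not part of the runtime.] The hot-team
// reuse path above leans on KMP_CHECK_UPDATE() so that shared team fields are
// only stored to when the value actually changes, which avoids dirtying cache
// lines that other threads may be reading. The sketch below is a minimal
// stand-alone rendering of that compare-before-store idea; check_update() and
// team_like are hypothetical names, not the runtime's definitions.
#if 0
#include <cstdio>

// Hypothetical stand-in for team fields that worker threads also read.
struct team_like {
  int sched_kind;
  int proc_bind;
};

// Store only when the value differs, so an unchanged field does not
// invalidate the cache line for concurrent readers.
template <typename T>
static inline void check_update(T &dst, const T &src) {
  if (dst != src)
    dst = src;
}

int main() {
  team_like t = {1, 2};
  check_update(t.sched_kind, 1); // no store: value unchanged
  check_update(t.proc_bind, 3);  // store: value changed
  std::printf("sched=%d bind=%d\n", t.sched_kind, t.proc_bind);
  return 0;
}
#endif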
5153 5154 KF_TRACE(10, (" hot_team = %p\n", team)); 5155 5156 #if KMP_DEBUG 5157 if (__kmp_tasking_mode != tskm_immediate_exec) { 5158 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5159 "task_team[1] = %p after reinit\n", 5160 team->t.t_task_team[0], team->t.t_task_team[1])); 5161 } 5162 #endif 5163 5164 #if OMPT_SUPPORT 5165 __ompt_team_assign_id(team, ompt_parallel_data); 5166 #endif 5167 5168 KMP_MB(); 5169 5170 return team; 5171 } 5172 5173 /* next, let's try to take one from the team pool */ 5174 KMP_MB(); 5175 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5176 /* TODO: consider resizing undersized teams instead of reaping them, now 5177 that we have a resizing mechanism */ 5178 if (team->t.t_max_nproc >= max_nproc) { 5179 /* take this team from the team pool */ 5180 __kmp_team_pool = team->t.t_next_pool; 5181 5182 /* setup the team for fresh use */ 5183 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5184 5185 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5186 "task_team[1] %p to NULL\n", 5187 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5188 team->t.t_task_team[0] = NULL; 5189 team->t.t_task_team[1] = NULL; 5190 5191 /* reallocate space for arguments if necessary */ 5192 __kmp_alloc_argv_entries(argc, team, TRUE); 5193 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5194 5195 KA_TRACE( 5196 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5197 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5198 { // Initialize barrier data. 5199 int b; 5200 for (b = 0; b < bs_last_barrier; ++b) { 5201 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5202 #if USE_DEBUGGER 5203 team->t.t_bar[b].b_master_arrived = 0; 5204 team->t.t_bar[b].b_team_arrived = 0; 5205 #endif 5206 } 5207 } 5208 5209 team->t.t_proc_bind = new_proc_bind; 5210 5211 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5212 team->t.t_id)); 5213 5214 #if OMPT_SUPPORT 5215 __ompt_team_assign_id(team, ompt_parallel_data); 5216 #endif 5217 5218 KMP_MB(); 5219 5220 return team; 5221 } 5222 5223 /* reap team if it is too small, then loop back and check the next one */ 5224 // not sure if this is wise, but, will be redone during the hot-teams 5225 // rewrite. 5226 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5227 team = __kmp_reap_team(team); 5228 __kmp_team_pool = team; 5229 } 5230 5231 /* nothing available in the pool, no matter, make a new team! 
*/ 5232 KMP_MB(); 5233 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5234 5235 /* and set it up */ 5236 team->t.t_max_nproc = max_nproc; 5237 /* NOTE well, for some reason allocating one big buffer and dividing it up 5238 seems to really hurt performance a lot on the P4, so, let's not use this */ 5239 __kmp_allocate_team_arrays(team, max_nproc); 5240 5241 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5242 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5243 5244 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5245 "%p to NULL\n", 5246 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5247 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5248 // memory, no need to duplicate 5249 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5250 // memory, no need to duplicate 5251 5252 if (__kmp_storage_map) { 5253 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5254 } 5255 5256 /* allocate space for arguments */ 5257 __kmp_alloc_argv_entries(argc, team, FALSE); 5258 team->t.t_argc = argc; 5259 5260 KA_TRACE(20, 5261 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5262 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5263 { // Initialize barrier data. 5264 int b; 5265 for (b = 0; b < bs_last_barrier; ++b) { 5266 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5267 #if USE_DEBUGGER 5268 team->t.t_bar[b].b_master_arrived = 0; 5269 team->t.t_bar[b].b_team_arrived = 0; 5270 #endif 5271 } 5272 } 5273 5274 team->t.t_proc_bind = new_proc_bind; 5275 5276 #if OMPT_SUPPORT 5277 __ompt_team_assign_id(team, ompt_parallel_data); 5278 team->t.ompt_serialized_team_info = NULL; 5279 #endif 5280 5281 KMP_MB(); 5282 5283 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5284 team->t.t_id)); 5285 5286 return team; 5287 } 5288 5289 /* TODO implement hot-teams at all levels */ 5290 /* TODO implement lazy thread release on demand (disband request) */ 5291 5292 /* free the team. return it to the team pool. release all the threads 5293 * associated with it */ 5294 void __kmp_free_team(kmp_root_t *root, 5295 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5296 int f; 5297 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5298 team->t.t_id)); 5299 5300 /* verify state */ 5301 KMP_DEBUG_ASSERT(root); 5302 KMP_DEBUG_ASSERT(team); 5303 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5304 KMP_DEBUG_ASSERT(team->t.t_threads); 5305 5306 int use_hot_team = team == root->r.r_hot_team; 5307 #if KMP_NESTED_HOT_TEAMS 5308 int level; 5309 kmp_hot_team_ptr_t *hot_teams; 5310 if (master) { 5311 level = team->t.t_active_level - 1; 5312 if (master->th.th_teams_microtask) { // in teams construct? 
5313 if (master->th.th_teams_size.nteams > 1) { 5314 ++level; // level was not increased in teams construct for 5315 // team_of_masters 5316 } 5317 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5318 master->th.th_teams_level == team->t.t_level) { 5319 ++level; // level was not increased in teams construct for 5320 // team_of_workers before the parallel 5321 } // team->t.t_level will be increased inside parallel 5322 } 5323 hot_teams = master->th.th_hot_teams; 5324 if (level < __kmp_hot_teams_max_level) { 5325 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5326 use_hot_team = 1; 5327 } 5328 } 5329 #endif // KMP_NESTED_HOT_TEAMS 5330 5331 /* team is done working */ 5332 TCW_SYNC_PTR(team->t.t_pkfn, 5333 NULL); // Important for Debugging Support Library. 5334 #if KMP_OS_WINDOWS 5335 team->t.t_copyin_counter = 0; // init counter for possible reuse 5336 #endif 5337 // Do not reset pointer to parent team to NULL for hot teams. 5338 5339 /* if we are non-hot team, release our threads */ 5340 if (!use_hot_team) { 5341 if (__kmp_tasking_mode != tskm_immediate_exec) { 5342 // Wait for threads to reach reapable state 5343 for (f = 1; f < team->t.t_nproc; ++f) { 5344 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5345 kmp_info_t *th = team->t.t_threads[f]; 5346 volatile kmp_uint32 *state = &th->th.th_reap_state; 5347 while (*state != KMP_SAFE_TO_REAP) { 5348 #if KMP_OS_WINDOWS 5349 // On Windows a thread can be killed at any time, check this 5350 DWORD ecode; 5351 if (!__kmp_is_thread_alive(th, &ecode)) { 5352 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5353 break; 5354 } 5355 #endif 5356 // first check if thread is sleeping 5357 kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5358 if (fl.is_sleeping()) 5359 fl.resume(__kmp_gtid_from_thread(th)); 5360 KMP_CPU_PAUSE(); 5361 } 5362 } 5363 5364 // Delete task teams 5365 int tt_idx; 5366 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5367 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5368 if (task_team != NULL) { 5369 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5370 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5371 team->t.t_threads[f]->th.th_task_team = NULL; 5372 } 5373 KA_TRACE( 5374 20, 5375 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5376 __kmp_get_gtid(), task_team, team->t.t_id)); 5377 #if KMP_NESTED_HOT_TEAMS 5378 __kmp_free_task_team(master, task_team); 5379 #endif 5380 team->t.t_task_team[tt_idx] = NULL; 5381 } 5382 } 5383 } 5384 5385 // Reset pointer to parent team only for non-hot teams. 
5386 team->t.t_parent = NULL; 5387 team->t.t_level = 0; 5388 team->t.t_active_level = 0; 5389 5390 /* free the worker threads */ 5391 for (f = 1; f < team->t.t_nproc; ++f) { 5392 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5393 __kmp_free_thread(team->t.t_threads[f]); 5394 team->t.t_threads[f] = NULL; 5395 } 5396 5397 /* put the team back in the team pool */ 5398 /* TODO limit size of team pool, call reap_team if pool too large */ 5399 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5400 __kmp_team_pool = (volatile kmp_team_t *)team; 5401 } else { // Check if team was created for the masters in a teams construct 5402 // See if first worker is a CG root 5403 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5404 team->t.t_threads[1]->th.th_cg_roots); 5405 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5406 // Clean up the CG root nodes on workers so that this team can be re-used 5407 for (f = 1; f < team->t.t_nproc; ++f) { 5408 kmp_info_t *thr = team->t.t_threads[f]; 5409 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5410 thr->th.th_cg_roots->cg_root == thr); 5411 // Pop current CG root off list 5412 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5413 thr->th.th_cg_roots = tmp->up; 5414 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5415 " up to node %p. cg_nthreads was %d\n", 5416 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5417 int i = tmp->cg_nthreads--; 5418 if (i == 1) { 5419 __kmp_free(tmp); // free CG if we are the last thread in it 5420 } 5421 // Restore current task's thread_limit from CG root 5422 if (thr->th.th_cg_roots) 5423 thr->th.th_current_task->td_icvs.thread_limit = 5424 thr->th.th_cg_roots->cg_thread_limit; 5425 } 5426 } 5427 } 5428 5429 KMP_MB(); 5430 } 5431 5432 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5433 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5434 kmp_team_t *next_pool = team->t.t_next_pool; 5435 5436 KMP_DEBUG_ASSERT(team); 5437 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5438 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5439 KMP_DEBUG_ASSERT(team->t.t_threads); 5440 KMP_DEBUG_ASSERT(team->t.t_argv); 5441 5442 /* TODO clean the threads that are a part of this? */ 5443 5444 /* free stuff */ 5445 __kmp_free_team_arrays(team); 5446 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5447 __kmp_free((void *)team->t.t_argv); 5448 __kmp_free(team); 5449 5450 KMP_MB(); 5451 return next_pool; 5452 } 5453 5454 // Free the thread. Don't reap it, just place it on the pool of available 5455 // threads. 5456 // 5457 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5458 // binding for the affinity mechanism to be useful. 5459 // 5460 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5461 // However, we want to avoid a potential performance problem by always 5462 // scanning through the list to find the correct point at which to insert 5463 // the thread (potential N**2 behavior). To do this we keep track of the 5464 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5465 // With single-level parallelism, threads will always be added to the tail 5466 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5467 // parallelism, all bets are off and we may need to scan through the entire 5468 // free list. 5469 // 5470 // This change also has a potentially large performance benefit, for some 5471 // applications. 
Previously, as threads were freed from the hot team, they
5472 // would be placed back on the free list in inverse order. If the hot team
5473 // grew back to its original size, then the freed thread would be placed
5474 // back on the hot team in reverse order. This could cause bad cache
5475 // locality problems on programs where the size of the hot team regularly
5476 // grew and shrunk.
5477 //
5478 // Now, for single-level parallelism, the OMP tid is always == gtid.
5479 void __kmp_free_thread(kmp_info_t *this_th) {
5480 int gtid;
5481 kmp_info_t **scan;
5482
5483 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5484 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5485
5486 KMP_DEBUG_ASSERT(this_th);
5487
5488 // When moving thread to pool, switch thread to wait on own b_go flag, and
5489 // uninitialized (NULL team).
5490 int b;
5491 kmp_balign_t *balign = this_th->th.th_bar;
5492 for (b = 0; b < bs_last_barrier; ++b) {
5493 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5494 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5495 balign[b].bb.team = NULL;
5496 balign[b].bb.leaf_kids = 0;
5497 }
5498 this_th->th.th_task_state = 0;
5499 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5500
5501 /* put thread back on the free pool */
5502 TCW_PTR(this_th->th.th_team, NULL);
5503 TCW_PTR(this_th->th.th_root, NULL);
5504 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5505
5506 while (this_th->th.th_cg_roots) {
5507 this_th->th.th_cg_roots->cg_nthreads--;
5508 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5509 " %p of thread %p to %d\n",
5510 this_th, this_th->th.th_cg_roots,
5511 this_th->th.th_cg_roots->cg_root,
5512 this_th->th.th_cg_roots->cg_nthreads));
5513 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5514 if (tmp->cg_root == this_th) { // Thread is a cg_root
5515 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5516 KA_TRACE(
5517 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5518 this_th->th.th_cg_roots = tmp->up;
5519 __kmp_free(tmp);
5520 } else { // Worker thread
5521 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5522 __kmp_free(tmp);
5523 }
5524 this_th->th.th_cg_roots = NULL;
5525 break;
5526 }
5527 }
5528
5529 /* If the implicit task assigned to this thread can be used by other threads
5530 * -> multiple threads can share the data and try to free the task at
5531 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5532 * with higher probability when the hot team is disabled, but can occur even
5533 * when the hot team is enabled */
5534 __kmp_free_implicit_task(this_th);
5535 this_th->th.th_current_task = NULL;
5536
5537 // If the __kmp_thread_pool_insert_pt is already past the new insert
5538 // point, then we need to re-scan the entire list.
5539 gtid = this_th->th.th_info.ds.ds_gtid;
5540 if (__kmp_thread_pool_insert_pt != NULL) {
5541 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5542 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5543 __kmp_thread_pool_insert_pt = NULL;
5544 }
5545 }
5546
5547 // Scan down the list to find the place to insert the thread.
5548 // scan is the address of a link in the list, possibly the address of
5549 // __kmp_thread_pool itself.
5550 //
5551 // In the absence of nested parallelism, the for loop will have 0 iterations.
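// [Editorial aside -- illustrative only, not part of the runtime.] The scan
// below keeps the pool sorted by gtid and remembers the last insertion point,
// so with single-level parallelism each free is effectively an O(1) append
// rather than an O(N) walk from the head. This is a minimal stand-alone sketch
// of that hinted sorted insert; node, pool_head and pool_insert_pt are
// hypothetical stand-ins, not the runtime's data structures.
#if 0
#include <cassert>
#include <cstddef>

struct node {
  int gtid;
  node *next;
};

static node *pool_head = nullptr;      // stand-in for __kmp_thread_pool
static node *pool_insert_pt = nullptr; // stand-in for __kmp_thread_pool_insert_pt

static void pool_insert_sorted(node *n) {
  // If the remembered insertion point is already past the new gtid,
  // fall back to scanning from the head of the list.
  if (pool_insert_pt && pool_insert_pt->gtid > n->gtid)
    pool_insert_pt = nullptr;

  node **scan = pool_insert_pt ? &pool_insert_pt->next : &pool_head;
  for (; *scan && (*scan)->gtid < n->gtid; scan = &(*scan)->next)
    ;

  n->next = *scan; // link in front of the first larger gtid (or at the tail)
  *scan = n;
  pool_insert_pt = n; // remember where we inserted for the next call
  assert(!n->next || n->gtid < n->next->gtid);
}

int main() {
  node a = {3, nullptr}, b = {5, nullptr}, c = {4, nullptr};
  pool_insert_sorted(&a); // appended at the tail
  pool_insert_sorted(&b); // appended at the tail via the hint: no scan needed
  pool_insert_sorted(&c); // out of order: hint is reset, scan restarts at head
  return 0;               // list is now 3 -> 4 -> 5
}
#endif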
5552 if (__kmp_thread_pool_insert_pt != NULL) { 5553 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5554 } else { 5555 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5556 } 5557 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5558 scan = &((*scan)->th.th_next_pool)) 5559 ; 5560 5561 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5562 // to its address. 5563 TCW_PTR(this_th->th.th_next_pool, *scan); 5564 __kmp_thread_pool_insert_pt = *scan = this_th; 5565 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5566 (this_th->th.th_info.ds.ds_gtid < 5567 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5568 TCW_4(this_th->th.th_in_pool, TRUE); 5569 __kmp_suspend_initialize_thread(this_th); 5570 __kmp_lock_suspend_mx(this_th); 5571 if (this_th->th.th_active == TRUE) { 5572 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5573 this_th->th.th_active_in_pool = TRUE; 5574 } 5575 #if KMP_DEBUG 5576 else { 5577 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5578 } 5579 #endif 5580 __kmp_unlock_suspend_mx(this_th); 5581 5582 TCW_4(__kmp_nth, __kmp_nth - 1); 5583 5584 #ifdef KMP_ADJUST_BLOCKTIME 5585 /* Adjust blocktime back to user setting or default if necessary */ 5586 /* Middle initialization might never have occurred */ 5587 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5588 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5589 if (__kmp_nth <= __kmp_avail_proc) { 5590 __kmp_zero_bt = FALSE; 5591 } 5592 } 5593 #endif /* KMP_ADJUST_BLOCKTIME */ 5594 5595 KMP_MB(); 5596 } 5597 5598 /* ------------------------------------------------------------------------ */ 5599 5600 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5601 int gtid = this_thr->th.th_info.ds.ds_gtid; 5602 /* void *stack_data;*/ 5603 kmp_team_t *(*volatile pteam); 5604 5605 KMP_MB(); 5606 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5607 5608 if (__kmp_env_consistency_check) { 5609 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 5610 } 5611 5612 #if OMPT_SUPPORT 5613 ompt_data_t *thread_data; 5614 if (ompt_enabled.enabled) { 5615 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5616 *thread_data = ompt_data_none; 5617 5618 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5619 this_thr->th.ompt_thread_info.wait_id = 0; 5620 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5621 if (ompt_enabled.ompt_callback_thread_begin) { 5622 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5623 ompt_thread_worker, thread_data); 5624 } 5625 } 5626 #endif 5627 5628 #if OMPT_SUPPORT 5629 if (ompt_enabled.enabled) { 5630 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5631 } 5632 #endif 5633 /* This is the place where threads wait for work */ 5634 while (!TCR_4(__kmp_global.g.g_done)) { 5635 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5636 KMP_MB(); 5637 5638 /* wait for work to do */ 5639 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5640 5641 /* No tid yet since not part of a team */ 5642 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5643 5644 #if OMPT_SUPPORT 5645 if (ompt_enabled.enabled) { 5646 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5647 } 5648 #endif 5649 5650 pteam = (kmp_team_t * (*))(&this_thr->th.th_team); 5651 5652 /* have we been allocated? 
*/ 5653 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5654 /* we were just woken up, so run our new task */ 5655 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5656 int rc; 5657 KA_TRACE(20, 5658 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5659 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5660 (*pteam)->t.t_pkfn)); 5661 5662 updateHWFPControl(*pteam); 5663 5664 #if OMPT_SUPPORT 5665 if (ompt_enabled.enabled) { 5666 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5667 } 5668 #endif 5669 5670 rc = (*pteam)->t.t_invoke(gtid); 5671 KMP_ASSERT(rc); 5672 5673 KMP_MB(); 5674 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5675 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5676 (*pteam)->t.t_pkfn)); 5677 } 5678 #if OMPT_SUPPORT 5679 if (ompt_enabled.enabled) { 5680 /* no frame set while outside task */ 5681 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5682 5683 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5684 } 5685 #endif 5686 /* join barrier after parallel region */ 5687 __kmp_join_barrier(gtid); 5688 } 5689 } 5690 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5691 5692 #if OMPT_SUPPORT 5693 if (ompt_enabled.ompt_callback_thread_end) { 5694 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5695 } 5696 #endif 5697 5698 this_thr->th.th_task_team = NULL; 5699 /* run the destructors for the threadprivate data for this thread */ 5700 __kmp_common_destroy_gtid(gtid); 5701 5702 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5703 KMP_MB(); 5704 return this_thr; 5705 } 5706 5707 /* ------------------------------------------------------------------------ */ 5708 5709 void __kmp_internal_end_dest(void *specific_gtid) { 5710 #if KMP_COMPILER_ICC 5711 #pragma warning(push) 5712 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose 5713 // significant bits 5714 #endif 5715 // Make sure no significant bits are lost 5716 int gtid = (kmp_intptr_t)specific_gtid - 1; 5717 #if KMP_COMPILER_ICC 5718 #pragma warning(pop) 5719 #endif 5720 5721 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5722 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5723 * this is because 0 is reserved for the nothing-stored case */ 5724 5725 /* josh: One reason for setting the gtid specific data even when it is being 5726 destroyed by pthread is to allow gtid lookup through thread specific data 5727 (__kmp_gtid_get_specific). Some of the code, especially stat code, 5728 that gets executed in the call to __kmp_internal_end_thread, actually 5729 gets the gtid through the thread specific data. Setting it here seems 5730 rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread 5731 to run smoothly. 5732 todo: get rid of this after we remove the dependence on 5733 __kmp_gtid_get_specific */ 5734 if (gtid >= 0 && KMP_UBER_GTID(gtid)) 5735 __kmp_gtid_set_specific(gtid); 5736 #ifdef KMP_TDATA_GTID 5737 __kmp_gtid = gtid; 5738 #endif 5739 __kmp_internal_end_thread(gtid); 5740 } 5741 5742 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5743 5744 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases 5745 // destructors work perfectly, but in real libomp.so I have no evidence it is 5746 // ever called. However, -fini linker option in makefile.mk works fine. 
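// [Editorial aside -- illustrative only, not part of the runtime.] The note
// above concerns GCC/Clang-style constructor/destructor hooks in a shared
// object. The sketch below is only a generic, minimal example of those
// attributes (hypothetical names; compile as its own shared library), not the
// runtime's shutdown logic.
#if 0
#include <cstdio>

__attribute__((constructor)) static void sketch_on_load(void) {
  std::fprintf(stderr, "libsketch loaded\n"); // runs at load time
}

__attribute__((destructor)) static void sketch_on_unload(void) {
  // Runs when the object is unloaded or the process exits normally --
  // the comment above notes this may not fire in every configuration.
  std::fprintf(stderr, "libsketch unloaded\n");
}
#endif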
5747
5748 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5749 __kmp_internal_end_atexit();
5750 }
5751
5752 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5753
5754 #endif
5755
5756 /* [Windows] josh: when the atexit handler is called, there may still be more
5757 than one thread alive */
5758 void __kmp_internal_end_atexit(void) {
5759 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5760 /* [Windows]
5761 josh: ideally, we want to completely shut down the library in this atexit
5762 handler, but stat code that depends on thread specific data for gtid fails
5763 because that data becomes unavailable at some point during the shutdown, so
5764 we call __kmp_internal_end_thread instead. We should eventually remove the
5765 dependency on __kmp_get_specific_gtid in the stat code and use
5766 __kmp_internal_end_library to cleanly shut down the library.
5767
5768 // TODO: Can some of this comment about GVS be removed?
5769 I suspect that the offending stat code is executed when the calling thread
5770 tries to clean up a dead root thread's data structures, resulting in GVS
5771 code trying to close the GVS structures for that thread, but since the stat
5772 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5773 the calling thread is cleaning up itself instead of another thread, it gets
5774 confused. This happens because allowing a thread to unregister and clean up
5775 another thread is a recent modification for addressing an issue.
5776 Based on the current design (20050722), a thread may end up
5777 trying to unregister another thread only if thread death does not trigger
5778 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5779 thread specific data destructor function to detect thread death. For
5780 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5781 is nothing. Thus, the workaround is applicable only for the Windows static
5782 stat library. */
5783 __kmp_internal_end_library(-1);
5784 #if KMP_OS_WINDOWS
5785 __kmp_close_console();
5786 #endif
5787 }
5788
5789 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5790 // It is assumed __kmp_forkjoin_lock is acquired.
5791
5792 int gtid;
5793
5794 KMP_DEBUG_ASSERT(thread != NULL);
5795
5796 gtid = thread->th.th_info.ds.ds_gtid;
5797
5798 if (!is_root) {
5799 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5800 /* Assume the threads are at the fork barrier here */
5801 KA_TRACE(
5802 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5803 gtid));
5804 /* Need release fence here to prevent seg faults for tree forkjoin barrier
5805 * (GEH) */
5806 ANNOTATE_HAPPENS_BEFORE(thread);
5807 kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5808 __kmp_release_64(&flag);
5809 }
5810
5811 // Terminate OS thread.
5812 __kmp_reap_worker(thread);
5813
5814 // The thread was killed asynchronously. If it was actively
5815 // spinning in the thread pool, decrement the global count.
5816 //
5817 // There is a small timing hole here - if the worker thread was just waking
5818 // up after sleeping in the pool, had reset its th_active_in_pool flag but
5819 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5820 // the global counter might not get updated.
5821 //
5822 // Currently, this can only happen as the library is unloaded,
5823 // so there are no harmful side effects.
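// [Editorial aside -- illustrative only, not part of the runtime.] The timing
// hole described above exists because clearing the per-thread flag and
// decrementing the global counter are two separate steps; a reaper that
// samples the flag in between sees "not active" and skips its own decrement.
// The sketch below restates that interleaving with hypothetical names; it is
// not the runtime's bookkeeping.
#if 0
#include <atomic>

static std::atomic<int> active_in_pool_count{1}; // stand-in for the global counter
static std::atomic<bool> active_in_pool{true};   // stand-in for th_active_in_pool

// Worker waking up from the pool: two separate steps, not one atomic unit.
// If the worker is killed between step 1 and step 2, nobody decrements the
// counter -- the reaper below will already see the flag cleared.
static void worker_wakes_up() {
  active_in_pool.store(false);       // step 1: clear the flag
  active_in_pool_count.fetch_sub(1); // step 2: drop the count
}

// Reaper killing the worker asynchronously: decrements only if the flag is
// still set, so it cannot double-decrement, but it can miss the update.
static void reaper_reaps_worker() {
  if (active_in_pool.exchange(false))
    active_in_pool_count.fetch_sub(1);
}

int main() {
  worker_wakes_up();
  reaper_reaps_worker();              // flag already clear: no second decrement
  return active_in_pool_count.load(); // 0 in this serial interleaving
}
#endif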
5824 if (thread->th.th_active_in_pool) { 5825 thread->th.th_active_in_pool = FALSE; 5826 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5827 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5828 } 5829 } 5830 5831 __kmp_free_implicit_task(thread); 5832 5833 // Free the fast memory for tasking 5834 #if USE_FAST_MEMORY 5835 __kmp_free_fast_memory(thread); 5836 #endif /* USE_FAST_MEMORY */ 5837 5838 __kmp_suspend_uninitialize_thread(thread); 5839 5840 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5841 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5842 5843 --__kmp_all_nth; 5844 // __kmp_nth was decremented when thread is added to the pool. 5845 5846 #ifdef KMP_ADJUST_BLOCKTIME 5847 /* Adjust blocktime back to user setting or default if necessary */ 5848 /* Middle initialization might never have occurred */ 5849 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5850 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5851 if (__kmp_nth <= __kmp_avail_proc) { 5852 __kmp_zero_bt = FALSE; 5853 } 5854 } 5855 #endif /* KMP_ADJUST_BLOCKTIME */ 5856 5857 /* free the memory being used */ 5858 if (__kmp_env_consistency_check) { 5859 if (thread->th.th_cons) { 5860 __kmp_free_cons_stack(thread->th.th_cons); 5861 thread->th.th_cons = NULL; 5862 } 5863 } 5864 5865 if (thread->th.th_pri_common != NULL) { 5866 __kmp_free(thread->th.th_pri_common); 5867 thread->th.th_pri_common = NULL; 5868 } 5869 5870 if (thread->th.th_task_state_memo_stack != NULL) { 5871 __kmp_free(thread->th.th_task_state_memo_stack); 5872 thread->th.th_task_state_memo_stack = NULL; 5873 } 5874 5875 #if KMP_USE_BGET 5876 if (thread->th.th_local.bget_data != NULL) { 5877 __kmp_finalize_bget(thread); 5878 } 5879 #endif 5880 5881 #if KMP_AFFINITY_SUPPORTED 5882 if (thread->th.th_affin_mask != NULL) { 5883 KMP_CPU_FREE(thread->th.th_affin_mask); 5884 thread->th.th_affin_mask = NULL; 5885 } 5886 #endif /* KMP_AFFINITY_SUPPORTED */ 5887 5888 #if KMP_USE_HIER_SCHED 5889 if (thread->th.th_hier_bar_data != NULL) { 5890 __kmp_free(thread->th.th_hier_bar_data); 5891 thread->th.th_hier_bar_data = NULL; 5892 } 5893 #endif 5894 5895 __kmp_reap_team(thread->th.th_serial_team); 5896 thread->th.th_serial_team = NULL; 5897 __kmp_free(thread); 5898 5899 KMP_MB(); 5900 5901 } // __kmp_reap_thread 5902 5903 static void __kmp_internal_end(void) { 5904 int i; 5905 5906 /* First, unregister the library */ 5907 __kmp_unregister_library(); 5908 5909 #if KMP_OS_WINDOWS 5910 /* In Win static library, we can't tell when a root actually dies, so we 5911 reclaim the data structures for any root threads that have died but not 5912 unregistered themselves, in order to shut down cleanly. 5913 In Win dynamic library we also can't tell when a thread dies. */ 5914 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 5915 // dead roots 5916 #endif 5917 5918 for (i = 0; i < __kmp_threads_capacity; i++) 5919 if (__kmp_root[i]) 5920 if (__kmp_root[i]->r.r_active) 5921 break; 5922 KMP_MB(); /* Flush all pending memory write invalidates. */ 5923 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 5924 5925 if (i < __kmp_threads_capacity) { 5926 #if KMP_USE_MONITOR 5927 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 5928 KMP_MB(); /* Flush all pending memory write invalidates. */ 5929 5930 // Need to check that monitor was initialized before reaping it. 
If we are 5931 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 5932 // __kmp_monitor will appear to contain valid data, but it is only valid in 5933 // the parent process, not the child. 5934 // New behavior (201008): instead of keying off of the flag 5935 // __kmp_init_parallel, the monitor thread creation is keyed off 5936 // of the new flag __kmp_init_monitor. 5937 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 5938 if (TCR_4(__kmp_init_monitor)) { 5939 __kmp_reap_monitor(&__kmp_monitor); 5940 TCW_4(__kmp_init_monitor, 0); 5941 } 5942 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 5943 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 5944 #endif // KMP_USE_MONITOR 5945 } else { 5946 /* TODO move this to cleanup code */ 5947 #ifdef KMP_DEBUG 5948 /* make sure that everything has properly ended */ 5949 for (i = 0; i < __kmp_threads_capacity; i++) { 5950 if (__kmp_root[i]) { 5951 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 5952 // there can be uber threads alive here 5953 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 5954 } 5955 } 5956 #endif 5957 5958 KMP_MB(); 5959 5960 // Reap the worker threads. 5961 // This is valid for now, but be careful if threads are reaped sooner. 5962 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 5963 // Get the next thread from the pool. 5964 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 5965 __kmp_thread_pool = thread->th.th_next_pool; 5966 // Reap it. 5967 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 5968 thread->th.th_next_pool = NULL; 5969 thread->th.th_in_pool = FALSE; 5970 __kmp_reap_thread(thread, 0); 5971 } 5972 __kmp_thread_pool_insert_pt = NULL; 5973 5974 // Reap teams. 5975 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 5976 // Get the next team from the pool. 5977 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 5978 __kmp_team_pool = team->t.t_next_pool; 5979 // Reap it. 5980 team->t.t_next_pool = NULL; 5981 __kmp_reap_team(team); 5982 } 5983 5984 __kmp_reap_task_teams(); 5985 5986 #if KMP_OS_UNIX 5987 // Threads that are not reaped should not access any resources since they 5988 // are going to be deallocated soon, so the shutdown sequence should wait 5989 // until all threads either exit the final spin-waiting loop or begin 5990 // sleeping after the given blocktime. 5991 for (i = 0; i < __kmp_threads_capacity; i++) { 5992 kmp_info_t *thr = __kmp_threads[i]; 5993 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 5994 KMP_CPU_PAUSE(); 5995 } 5996 #endif 5997 5998 for (i = 0; i < __kmp_threads_capacity; ++i) { 5999 // TBD: Add some checking... 6000 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6001 } 6002 6003 /* Make sure all threadprivate destructors get run by joining with all 6004 worker threads before resetting this flag */ 6005 TCW_SYNC_4(__kmp_init_common, FALSE); 6006 6007 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6008 KMP_MB(); 6009 6010 #if KMP_USE_MONITOR 6011 // See note above: One of the possible fixes for CQ138434 / CQ140126 6012 // 6013 // FIXME: push both code fragments down and CSE them? 6014 // push them into __kmp_cleanup() ? 
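// [Editorial aside -- illustrative only, not part of the runtime.] The block
// below takes __kmp_monitor_lock, tests __kmp_init_monitor, reaps the monitor,
// and clears the flag, so the monitor is torn down at most once even if
// several shutdown paths race. This is a minimal generic sketch of that
// test-under-lock, one-shot teardown shape with stand-in names; it is not the
// runtime's monitor code.
#if 0
#include <cstdio>
#include <mutex>

static std::mutex monitor_lock;         // stand-in for __kmp_monitor_lock
static bool monitor_initialized = true; // stand-in for __kmp_init_monitor

static void reap_monitor() { std::puts("monitor reaped"); }

// Safe to call from multiple shutdown paths: only the first caller that sees
// the flag set performs the teardown; later callers find it already cleared.
static void shutdown_monitor_once() {
  std::lock_guard<std::mutex> guard(monitor_lock);
  if (monitor_initialized) {
    reap_monitor();
    monitor_initialized = false;
  }
}

int main() {
  shutdown_monitor_once(); // reaps the monitor
  shutdown_monitor_once(); // no-op on the second call
  return 0;
}
#endif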
6015 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6016 if (TCR_4(__kmp_init_monitor)) { 6017 __kmp_reap_monitor(&__kmp_monitor); 6018 TCW_4(__kmp_init_monitor, 0); 6019 } 6020 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6021 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6022 #endif 6023 } /* else !__kmp_global.t_active */ 6024 TCW_4(__kmp_init_gtid, FALSE); 6025 KMP_MB(); /* Flush all pending memory write invalidates. */ 6026 6027 __kmp_cleanup(); 6028 #if OMPT_SUPPORT 6029 ompt_fini(); 6030 #endif 6031 } 6032 6033 void __kmp_internal_end_library(int gtid_req) { 6034 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6035 /* this shouldn't be a race condition because __kmp_internal_end() is the 6036 only place to clear __kmp_serial_init */ 6037 /* we'll check this later too, after we get the lock */ 6038 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6039 // redundaant, because the next check will work in any case. 6040 if (__kmp_global.g.g_abort) { 6041 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6042 /* TODO abort? */ 6043 return; 6044 } 6045 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6046 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6047 return; 6048 } 6049 6050 KMP_MB(); /* Flush all pending memory write invalidates. */ 6051 6052 /* find out who we are and what we should do */ 6053 { 6054 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6055 KA_TRACE( 6056 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6057 if (gtid == KMP_GTID_SHUTDOWN) { 6058 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6059 "already shutdown\n")); 6060 return; 6061 } else if (gtid == KMP_GTID_MONITOR) { 6062 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6063 "registered, or system shutdown\n")); 6064 return; 6065 } else if (gtid == KMP_GTID_DNE) { 6066 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6067 "shutdown\n")); 6068 /* we don't know who we are, but we may still shutdown the library */ 6069 } else if (KMP_UBER_GTID(gtid)) { 6070 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6071 if (__kmp_root[gtid]->r.r_active) { 6072 __kmp_global.g.g_abort = -1; 6073 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6074 KA_TRACE(10, 6075 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6076 gtid)); 6077 return; 6078 } else { 6079 KA_TRACE( 6080 10, 6081 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6082 __kmp_unregister_root_current_thread(gtid); 6083 } 6084 } else { 6085 /* worker threads may call this function through the atexit handler, if they 6086 * call exit() */ 6087 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6088 TODO: do a thorough shutdown instead */ 6089 #ifdef DUMP_DEBUG_ON_EXIT 6090 if (__kmp_debug_buf) 6091 __kmp_dump_debug_buffer(); 6092 #endif 6093 return; 6094 } 6095 } 6096 /* synchronize the termination process */ 6097 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6098 6099 /* have we already finished */ 6100 if (__kmp_global.g.g_abort) { 6101 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6102 /* TODO abort? 
*/ 6103 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6104 return; 6105 } 6106 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6107 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6108 return; 6109 } 6110 6111 /* We need this lock to enforce mutex between this reading of 6112 __kmp_threads_capacity and the writing by __kmp_register_root. 6113 Alternatively, we can use a counter of roots that is atomically updated by 6114 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6115 __kmp_internal_end_*. */ 6116 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6117 6118 /* now we can safely conduct the actual termination */ 6119 __kmp_internal_end(); 6120 6121 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6122 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6123 6124 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6125 6126 #ifdef DUMP_DEBUG_ON_EXIT 6127 if (__kmp_debug_buf) 6128 __kmp_dump_debug_buffer(); 6129 #endif 6130 6131 #if KMP_OS_WINDOWS 6132 __kmp_close_console(); 6133 #endif 6134 6135 __kmp_fini_allocator(); 6136 6137 } // __kmp_internal_end_library 6138 6139 void __kmp_internal_end_thread(int gtid_req) { 6140 int i; 6141 6142 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6143 /* this shouldn't be a race condition because __kmp_internal_end() is the 6144 * only place to clear __kmp_serial_init */ 6145 /* we'll check this later too, after we get the lock */ 6146 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6147 // redundant, because the next check will work in any case. 6148 if (__kmp_global.g.g_abort) { 6149 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6150 /* TODO abort? */ 6151 return; 6152 } 6153 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6154 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6155 return; 6156 } 6157 6158 KMP_MB(); /* Flush all pending memory write invalidates. */ 6159 6160 /* find out who we are and what we should do */ 6161 { 6162 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6163 KA_TRACE(10, 6164 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6165 if (gtid == KMP_GTID_SHUTDOWN) { 6166 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6167 "already shutdown\n")); 6168 return; 6169 } else if (gtid == KMP_GTID_MONITOR) { 6170 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6171 "registered, or system shutdown\n")); 6172 return; 6173 } else if (gtid == KMP_GTID_DNE) { 6174 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6175 "shutdown\n")); 6176 return; 6177 /* we don't know who we are */ 6178 } else if (KMP_UBER_GTID(gtid)) { 6179 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6180 if (__kmp_root[gtid]->r.r_active) { 6181 __kmp_global.g.g_abort = -1; 6182 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6183 KA_TRACE(10, 6184 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6185 gtid)); 6186 return; 6187 } else { 6188 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6189 gtid)); 6190 __kmp_unregister_root_current_thread(gtid); 6191 } 6192 } else { 6193 /* just a worker thread, let's leave */ 6194 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6195 6196 if (gtid >= 0) { 6197 __kmp_threads[gtid]->th.th_task_team = NULL; 6198 } 6199 6200 KA_TRACE(10, 6201 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6202 gtid)); 6203 return; 6204 } 6205 } 6206 #if KMP_DYNAMIC_LIB 6207 if (__kmp_pause_status != kmp_hard_paused) 6208 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6209 // because we will better shutdown later in the library destructor. 6210 { 6211 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6212 return; 6213 } 6214 #endif 6215 /* synchronize the termination process */ 6216 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6217 6218 /* have we already finished */ 6219 if (__kmp_global.g.g_abort) { 6220 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6221 /* TODO abort? */ 6222 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6223 return; 6224 } 6225 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6226 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6227 return; 6228 } 6229 6230 /* We need this lock to enforce mutex between this reading of 6231 __kmp_threads_capacity and the writing by __kmp_register_root. 6232 Alternatively, we can use a counter of roots that is atomically updated by 6233 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6234 __kmp_internal_end_*. */ 6235 6236 /* should we finish the run-time? are all siblings done? */ 6237 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6238 6239 for (i = 0; i < __kmp_threads_capacity; ++i) { 6240 if (KMP_UBER_GTID(i)) { 6241 KA_TRACE( 6242 10, 6243 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6244 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6245 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6246 return; 6247 } 6248 } 6249 6250 /* now we can safely conduct the actual termination */ 6251 6252 __kmp_internal_end(); 6253 6254 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6255 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6256 6257 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6258 6259 #ifdef DUMP_DEBUG_ON_EXIT 6260 if (__kmp_debug_buf) 6261 __kmp_dump_debug_buffer(); 6262 #endif 6263 } // __kmp_internal_end_thread 6264 6265 // ----------------------------------------------------------------------------- 6266 // Library registration stuff. 6267 6268 static long __kmp_registration_flag = 0; 6269 // Random value used to indicate library initialization. 6270 static char *__kmp_registration_str = NULL; 6271 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6272 6273 static inline char *__kmp_reg_status_name() { 6274 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6275 each thread. If registration and unregistration go in different threads 6276 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6277 env var can not be found, because the name will contain different pid. 
*/
6278 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6279 } // __kmp_reg_status_name
6280
6281 void __kmp_register_library_startup(void) {
6282
6283 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6284 int done = 0;
6285 union {
6286 double dtime;
6287 long ltime;
6288 } time;
6289 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6290 __kmp_initialize_system_tick();
6291 #endif
6292 __kmp_read_system_time(&time.dtime);
6293 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6294 __kmp_registration_str =
6295 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6296 __kmp_registration_flag, KMP_LIBRARY_FILE);
6297
6298 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6299 __kmp_registration_str));
6300
6301 while (!done) {
6302
6303 char *value = NULL; // Actual value of the environment variable.
6304
6305 // Set the environment variable, but do not overwrite it if it already exists.
6306 __kmp_env_set(name, __kmp_registration_str, 0);
6307 // Check that the variable was written.
6308 value = __kmp_env_get(name);
6309 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6310
6311 done = 1; // Ok, environment variable set successfully, exit the loop.
6312
6313 } else {
6314
6315 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6316 // Check whether it is alive or dead.
6317 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6318 char *tail = value;
6319 char *flag_addr_str = NULL;
6320 char *flag_val_str = NULL;
6321 char const *file_name = NULL;
6322 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6323 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6324 file_name = tail;
6325 if (tail != NULL) {
6326 long *flag_addr = 0;
6327 long flag_val = 0;
6328 KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6329 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6330 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6331 // First, check whether the environment-encoded address is mapped into
6332 // the address space.
6333 // If so, dereference it to see if it still has the right value.
6334 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6335 neighbor = 1;
6336 } else {
6337 // If not, then we know the other copy of the library is no longer
6338 // running.
6339 neighbor = 2;
6340 }
6341 }
6342 }
6343 switch (neighbor) {
6344 case 0: // Cannot parse environment variable -- neighbor status unknown.
6345 // Assume it is the incompatible format of a future version of the
6346 // library. Assume the other library is alive.
6347 // WARN( ... ); // TODO: Issue a warning.
6348 file_name = "unknown library";
6349 KMP_FALLTHROUGH();
6350 // Attention! Falling through to the next case. That's intentional.
6351 case 1: { // Neighbor is alive.
6352 // Check whether that is allowed.
6353 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6354 if (!__kmp_str_match_true(duplicate_ok)) {
6355 // That's not allowed. Issue fatal error.
6356 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6357 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6358 }
6359 KMP_INTERNAL_FREE(duplicate_ok);
6360 __kmp_duplicate_library_ok = 1;
6361 done = 1; // Exit the loop.
6362 } break;
6363 case 2: { // Neighbor is dead.
6364 // Clear the variable and try to register the library again.
6365 __kmp_env_unset(name); 6366 } break; 6367 default: { KMP_DEBUG_ASSERT(0); } break; 6368 } 6369 } 6370 KMP_INTERNAL_FREE((void *)value); 6371 } 6372 KMP_INTERNAL_FREE((void *)name); 6373 6374 } // func __kmp_register_library_startup 6375 6376 void __kmp_unregister_library(void) { 6377 6378 char *name = __kmp_reg_status_name(); 6379 char *value = __kmp_env_get(name); 6380 6381 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6382 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6383 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6384 // Ok, this is our variable. Delete it. 6385 __kmp_env_unset(name); 6386 } 6387 6388 KMP_INTERNAL_FREE(__kmp_registration_str); 6389 KMP_INTERNAL_FREE(value); 6390 KMP_INTERNAL_FREE(name); 6391 6392 __kmp_registration_flag = 0; 6393 __kmp_registration_str = NULL; 6394 6395 } // __kmp_unregister_library 6396 6397 // End of Library registration stuff. 6398 // ----------------------------------------------------------------------------- 6399 6400 #if KMP_MIC_SUPPORTED 6401 6402 static void __kmp_check_mic_type() { 6403 kmp_cpuid_t cpuid_state = {0}; 6404 kmp_cpuid_t *cs_p = &cpuid_state; 6405 __kmp_x86_cpuid(1, 0, cs_p); 6406 // We don't support mic1 at the moment 6407 if ((cs_p->eax & 0xff0) == 0xB10) { 6408 __kmp_mic_type = mic2; 6409 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6410 __kmp_mic_type = mic3; 6411 } else { 6412 __kmp_mic_type = non_mic; 6413 } 6414 } 6415 6416 #endif /* KMP_MIC_SUPPORTED */ 6417 6418 static void __kmp_do_serial_initialize(void) { 6419 int i, gtid; 6420 int size; 6421 6422 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6423 6424 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6425 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6426 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6427 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6428 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6429 6430 #if OMPT_SUPPORT 6431 ompt_pre_init(); 6432 #endif 6433 6434 __kmp_validate_locks(); 6435 6436 /* Initialize internal memory allocator */ 6437 __kmp_init_allocator(); 6438 6439 /* Register the library startup via an environment variable and check to see 6440 whether another copy of the library is already registered. 
*/ 6441 6442 __kmp_register_library_startup(); 6443 6444 /* TODO reinitialization of library */ 6445 if (TCR_4(__kmp_global.g.g_done)) { 6446 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6447 } 6448 6449 __kmp_global.g.g_abort = 0; 6450 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6451 6452 /* initialize the locks */ 6453 #if KMP_USE_ADAPTIVE_LOCKS 6454 #if KMP_DEBUG_ADAPTIVE_LOCKS 6455 __kmp_init_speculative_stats(); 6456 #endif 6457 #endif 6458 #if KMP_STATS_ENABLED 6459 __kmp_stats_init(); 6460 #endif 6461 __kmp_init_lock(&__kmp_global_lock); 6462 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6463 __kmp_init_lock(&__kmp_debug_lock); 6464 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6465 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6466 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6467 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6468 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6469 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6470 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6471 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6472 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6473 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6474 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6475 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6476 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6477 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6478 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6479 #if KMP_USE_MONITOR 6480 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6481 #endif 6482 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6483 6484 /* conduct initialization and initial setup of configuration */ 6485 6486 __kmp_runtime_initialize(); 6487 6488 #if KMP_MIC_SUPPORTED 6489 __kmp_check_mic_type(); 6490 #endif 6491 6492 // Some global variable initialization moved here from kmp_env_initialize() 6493 #ifdef KMP_DEBUG 6494 kmp_diag = 0; 6495 #endif 6496 __kmp_abort_delay = 0; 6497 6498 // From __kmp_init_dflt_team_nth() 6499 /* assume the entire machine will be used */ 6500 __kmp_dflt_team_nth_ub = __kmp_xproc; 6501 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6502 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6503 } 6504 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6505 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6506 } 6507 __kmp_max_nth = __kmp_sys_max_nth; 6508 __kmp_cg_max_nth = __kmp_sys_max_nth; 6509 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6510 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6511 __kmp_teams_max_nth = __kmp_sys_max_nth; 6512 } 6513 6514 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6515 // part 6516 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6517 #if KMP_USE_MONITOR 6518 __kmp_monitor_wakeups = 6519 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6520 __kmp_bt_intervals = 6521 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6522 #endif 6523 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6524 __kmp_library = library_throughput; 6525 // From KMP_SCHEDULE initialization 6526 __kmp_static = kmp_sch_static_balanced; 6527 // AC: do not use analytical here, because it is non-monotonous 6528 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6529 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6530 // need to repeat assignment 6531 // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch 6532 // bit control and barrier method control parts 6533 #if KMP_FAST_REDUCTION_BARRIER 6534 #define kmp_reduction_barrier_gather_bb ((int)1) 6535 #define kmp_reduction_barrier_release_bb ((int)1) 6536 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6537 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6538 #endif // KMP_FAST_REDUCTION_BARRIER 6539 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6540 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6541 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6542 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6543 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6544 #if KMP_FAST_REDUCTION_BARRIER 6545 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6546 // lin_64 ): hyper,1 6547 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6548 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6549 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6550 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6551 } 6552 #endif // KMP_FAST_REDUCTION_BARRIER 6553 } 6554 #if KMP_FAST_REDUCTION_BARRIER 6555 #undef kmp_reduction_barrier_release_pat 6556 #undef kmp_reduction_barrier_gather_pat 6557 #undef kmp_reduction_barrier_release_bb 6558 #undef kmp_reduction_barrier_gather_bb 6559 #endif // KMP_FAST_REDUCTION_BARRIER 6560 #if KMP_MIC_SUPPORTED 6561 if (__kmp_mic_type == mic2) { // KNC 6562 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6563 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6564 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6565 1; // forkjoin release 6566 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6567 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6568 } 6569 #if KMP_FAST_REDUCTION_BARRIER 6570 if (__kmp_mic_type == mic2) { // KNC 6571 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6572 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6573 } 6574 #endif // KMP_FAST_REDUCTION_BARRIER 6575 #endif // KMP_MIC_SUPPORTED 6576 6577 // From KMP_CHECKS initialization 6578 #ifdef KMP_DEBUG 6579 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6580 #else 6581 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6582 #endif 6583 6584 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6585 __kmp_foreign_tp = TRUE; 6586 6587 __kmp_global.g.g_dynamic = FALSE; 6588 __kmp_global.g.g_dynamic_mode = dynamic_default; 6589 6590 __kmp_env_initialize(NULL); 6591 6592 // Print all messages in message catalog for testing purposes. 
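// For example (illustrative; the block below is compiled only under KMP_DEBUG):
// running a debug build with KMP_DUMP_CATALOG set to a true value such as "1"
// prints every entry of the i18n message catalog at startup, which is handy for
// verifying that the catalog (or a translation of it) loads correctly. Release
// builds ignore the variable.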
6593 #ifdef KMP_DEBUG 6594 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6595 if (__kmp_str_match_true(val)) { 6596 kmp_str_buf_t buffer; 6597 __kmp_str_buf_init(&buffer); 6598 __kmp_i18n_dump_catalog(&buffer); 6599 __kmp_printf("%s", buffer.str); 6600 __kmp_str_buf_free(&buffer); 6601 } 6602 __kmp_env_free(&val); 6603 #endif 6604 6605 __kmp_threads_capacity = 6606 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6607 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6608 __kmp_tp_capacity = __kmp_default_tp_capacity( 6609 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6610 6611 // If the library is shut down properly, both pools must be NULL. Just in 6612 // case, set them to NULL -- some memory may leak, but subsequent code will 6613 // work even if pools are not freed. 6614 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6615 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6616 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6617 __kmp_thread_pool = NULL; 6618 __kmp_thread_pool_insert_pt = NULL; 6619 __kmp_team_pool = NULL; 6620 6621 /* Allocate all of the variable sized records */ 6622 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6623 * expandable */ 6624 /* Since allocation is cache-aligned, just add extra padding at the end */ 6625 size = 6626 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6627 CACHE_LINE; 6628 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6629 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6630 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6631 6632 /* init thread counts */ 6633 KMP_DEBUG_ASSERT(__kmp_all_nth == 6634 0); // Asserts fail if the library is reinitializing and 6635 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6636 __kmp_all_nth = 0; 6637 __kmp_nth = 0; 6638 6639 /* setup the uber master thread and hierarchy */ 6640 gtid = __kmp_register_root(TRUE); 6641 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6642 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6643 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6644 6645 KMP_MB(); /* Flush all pending memory write invalidates. */ 6646 6647 __kmp_common_initialize(); 6648 6649 #if KMP_OS_UNIX 6650 /* invoke the child fork handler */ 6651 __kmp_register_atfork(); 6652 #endif 6653 6654 #if !KMP_DYNAMIC_LIB 6655 { 6656 /* Invoke the exit handler when the program finishes, only for static 6657 library. For dynamic library, we already have _fini and DllMain. */ 6658 int rc = atexit(__kmp_internal_end_atexit); 6659 if (rc != 0) { 6660 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6661 __kmp_msg_null); 6662 } 6663 } 6664 #endif 6665 6666 #if KMP_HANDLE_SIGNALS 6667 #if KMP_OS_UNIX 6668 /* NOTE: make sure that this is called before the user installs their own 6669 signal handlers so that the user handlers are called first. this way they 6670 can return false, not call our handler, avoid terminating the library, and 6671 continue execution where they left off. 
*/ 6672 __kmp_install_signals(FALSE); 6673 #endif /* KMP_OS_UNIX */ 6674 #if KMP_OS_WINDOWS 6675 __kmp_install_signals(TRUE); 6676 #endif /* KMP_OS_WINDOWS */ 6677 #endif 6678 6679 /* we have finished the serial initialization */ 6680 __kmp_init_counter++; 6681 6682 __kmp_init_serial = TRUE; 6683 6684 if (__kmp_settings) { 6685 __kmp_env_print(); 6686 } 6687 6688 if (__kmp_display_env || __kmp_display_env_verbose) { 6689 __kmp_env_print_2(); 6690 } 6691 6692 #if OMPT_SUPPORT 6693 ompt_post_init(); 6694 #endif 6695 6696 KMP_MB(); 6697 6698 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6699 } 6700 6701 void __kmp_serial_initialize(void) { 6702 if (__kmp_init_serial) { 6703 return; 6704 } 6705 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6706 if (__kmp_init_serial) { 6707 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6708 return; 6709 } 6710 __kmp_do_serial_initialize(); 6711 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6712 } 6713 6714 static void __kmp_do_middle_initialize(void) { 6715 int i, j; 6716 int prev_dflt_team_nth; 6717 6718 if (!__kmp_init_serial) { 6719 __kmp_do_serial_initialize(); 6720 } 6721 6722 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6723 6724 // Save the previous value for the __kmp_dflt_team_nth so that 6725 // we can avoid some reinitialization if it hasn't changed. 6726 prev_dflt_team_nth = __kmp_dflt_team_nth; 6727 6728 #if KMP_AFFINITY_SUPPORTED 6729 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6730 // number of cores on the machine. 6731 __kmp_affinity_initialize(); 6732 6733 // Run through the __kmp_threads array and set the affinity mask 6734 // for each root thread that is currently registered with the RTL. 6735 for (i = 0; i < __kmp_threads_capacity; i++) { 6736 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6737 __kmp_affinity_set_init_mask(i, TRUE); 6738 } 6739 } 6740 #endif /* KMP_AFFINITY_SUPPORTED */ 6741 6742 KMP_ASSERT(__kmp_xproc > 0); 6743 if (__kmp_avail_proc == 0) { 6744 __kmp_avail_proc = __kmp_xproc; 6745 } 6746 6747 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6748 // correct them now 6749 j = 0; 6750 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6751 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 6752 __kmp_avail_proc; 6753 j++; 6754 } 6755 6756 if (__kmp_dflt_team_nth == 0) { 6757 #ifdef KMP_DFLT_NTH_CORES 6758 // Default #threads = #cores 6759 __kmp_dflt_team_nth = __kmp_ncores; 6760 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6761 "__kmp_ncores (%d)\n", 6762 __kmp_dflt_team_nth)); 6763 #else 6764 // Default #threads = #available OS procs 6765 __kmp_dflt_team_nth = __kmp_avail_proc; 6766 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6767 "__kmp_avail_proc(%d)\n", 6768 __kmp_dflt_team_nth)); 6769 #endif /* KMP_DFLT_NTH_CORES */ 6770 } 6771 6772 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 6773 __kmp_dflt_team_nth = KMP_MIN_NTH; 6774 } 6775 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 6776 __kmp_dflt_team_nth = __kmp_sys_max_nth; 6777 } 6778 6779 // There's no harm in continuing if the following check fails, 6780 // but it indicates an error in the previous logic. 
6781 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 6782 6783 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 6784 // Run through the __kmp_threads array and set the num threads icv for each 6785 // root thread that is currently registered with the RTL (which has not 6786 // already explicitly set its nthreads-var with a call to 6787 // omp_set_num_threads()). 6788 for (i = 0; i < __kmp_threads_capacity; i++) { 6789 kmp_info_t *thread = __kmp_threads[i]; 6790 if (thread == NULL) 6791 continue; 6792 if (thread->th.th_current_task->td_icvs.nproc != 0) 6793 continue; 6794 6795 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 6796 } 6797 } 6798 KA_TRACE( 6799 20, 6800 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 6801 __kmp_dflt_team_nth)); 6802 6803 #ifdef KMP_ADJUST_BLOCKTIME 6804 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 6805 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6806 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6807 if (__kmp_nth > __kmp_avail_proc) { 6808 __kmp_zero_bt = TRUE; 6809 } 6810 } 6811 #endif /* KMP_ADJUST_BLOCKTIME */ 6812 6813 /* we have finished middle initialization */ 6814 TCW_SYNC_4(__kmp_init_middle, TRUE); 6815 6816 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 6817 } 6818 6819 void __kmp_middle_initialize(void) { 6820 if (__kmp_init_middle) { 6821 return; 6822 } 6823 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6824 if (__kmp_init_middle) { 6825 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6826 return; 6827 } 6828 __kmp_do_middle_initialize(); 6829 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6830 } 6831 6832 void __kmp_parallel_initialize(void) { 6833 int gtid = __kmp_entry_gtid(); // this might be a new root 6834 6835 /* synchronize parallel initialization (for sibling) */ 6836 if (TCR_4(__kmp_init_parallel)) 6837 return; 6838 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6839 if (TCR_4(__kmp_init_parallel)) { 6840 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6841 return; 6842 } 6843 6844 /* TODO reinitialization after we have already shut down */ 6845 if (TCR_4(__kmp_global.g.g_done)) { 6846 KA_TRACE( 6847 10, 6848 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 6849 __kmp_infinite_loop(); 6850 } 6851 6852 /* jc: The lock __kmp_initz_lock is already held, so calling 6853 __kmp_serial_initialize would cause a deadlock. So we call 6854 __kmp_do_serial_initialize directly. */ 6855 if (!__kmp_init_middle) { 6856 __kmp_do_middle_initialize(); 6857 } 6858 __kmp_resume_if_hard_paused(); 6859 6860 /* begin initialization */ 6861 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 6862 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6863 6864 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6865 // Save the FP control regs. 6866 // Worker threads will set theirs to these values at thread startup. 
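// (Rationale: capturing the primary thread's x87 control word and MXCSR here,
// with MXCSR masked by KMP_X86_MXCSR_MASK, gives every worker the same
// floating-point rounding and exception-mask settings, so numeric behavior does
// not depend on which thread executes a given chunk of work.)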
6867 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 6868 __kmp_store_mxcsr(&__kmp_init_mxcsr); 6869 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 6870 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 6871 6872 #if KMP_OS_UNIX 6873 #if KMP_HANDLE_SIGNALS 6874 /* must be after __kmp_serial_initialize */ 6875 __kmp_install_signals(TRUE); 6876 #endif 6877 #endif 6878 6879 __kmp_suspend_initialize(); 6880 6881 #if defined(USE_LOAD_BALANCE) 6882 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6883 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 6884 } 6885 #else 6886 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6887 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 6888 } 6889 #endif 6890 6891 if (__kmp_version) { 6892 __kmp_print_version_2(); 6893 } 6894 6895 /* we have finished parallel initialization */ 6896 TCW_SYNC_4(__kmp_init_parallel, TRUE); 6897 6898 KMP_MB(); 6899 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 6900 6901 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6902 } 6903 6904 /* ------------------------------------------------------------------------ */ 6905 6906 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 6907 kmp_team_t *team) { 6908 kmp_disp_t *dispatch; 6909 6910 KMP_MB(); 6911 6912 /* none of the threads have encountered any constructs, yet. */ 6913 this_thr->th.th_local.this_construct = 0; 6914 #if KMP_CACHE_MANAGE 6915 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 6916 #endif /* KMP_CACHE_MANAGE */ 6917 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 6918 KMP_DEBUG_ASSERT(dispatch); 6919 KMP_DEBUG_ASSERT(team->t.t_dispatch); 6920 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 6921 // this_thr->th.th_info.ds.ds_tid ] ); 6922 6923 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 6924 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 6925 if (__kmp_env_consistency_check) 6926 __kmp_push_parallel(gtid, team->t.t_ident); 6927 6928 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 6929 } 6930 6931 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 6932 kmp_team_t *team) { 6933 if (__kmp_env_consistency_check) 6934 __kmp_pop_parallel(gtid, team->t.t_ident); 6935 6936 __kmp_finish_implicit_task(this_thr); 6937 } 6938 6939 int __kmp_invoke_task_func(int gtid) { 6940 int rc; 6941 int tid = __kmp_tid_from_gtid(gtid); 6942 kmp_info_t *this_thr = __kmp_threads[gtid]; 6943 kmp_team_t *team = this_thr->th.th_team; 6944 6945 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 6946 #if USE_ITT_BUILD 6947 if (__itt_stack_caller_create_ptr) { 6948 __kmp_itt_stack_callee_enter( 6949 (__itt_caller) 6950 team->t.t_stack_id); // inform ittnotify about entering user's code 6951 } 6952 #endif /* USE_ITT_BUILD */ 6953 #if INCLUDE_SSC_MARKS 6954 SSC_MARK_INVOKING(); 6955 #endif 6956 6957 #if OMPT_SUPPORT 6958 void *dummy; 6959 void **exit_runtime_p; 6960 ompt_data_t *my_task_data; 6961 ompt_data_t *my_parallel_data; 6962 int ompt_team_size; 6963 6964 if (ompt_enabled.enabled) { 6965 exit_runtime_p = &( 6966 team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr); 6967 } else { 6968 exit_runtime_p = &dummy; 6969 } 6970 6971 my_task_data = 6972 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 6973 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 6974 if (ompt_enabled.ompt_callback_implicit_task) { 6975 ompt_team_size = team->t.t_nproc; 6976 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 6977 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 6978 __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 6979 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 6980 } 6981 #endif 6982 6983 #if KMP_STATS_ENABLED 6984 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 6985 if (previous_state == stats_state_e::TEAMS_REGION) { 6986 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 6987 } else { 6988 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 6989 } 6990 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 6991 #endif 6992 6993 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 6994 tid, (int)team->t.t_argc, (void **)team->t.t_argv 6995 #if OMPT_SUPPORT 6996 , 6997 exit_runtime_p 6998 #endif 6999 ); 7000 #if OMPT_SUPPORT 7001 *exit_runtime_p = NULL; 7002 #endif 7003 7004 #if KMP_STATS_ENABLED 7005 if (previous_state == stats_state_e::TEAMS_REGION) { 7006 KMP_SET_THREAD_STATE(previous_state); 7007 } 7008 KMP_POP_PARTITIONED_TIMER(); 7009 #endif 7010 7011 #if USE_ITT_BUILD 7012 if (__itt_stack_caller_create_ptr) { 7013 __kmp_itt_stack_callee_leave( 7014 (__itt_caller) 7015 team->t.t_stack_id); // inform ittnotify about leaving user's code 7016 } 7017 #endif /* USE_ITT_BUILD */ 7018 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7019 7020 return rc; 7021 } 7022 7023 void __kmp_teams_master(int gtid) { 7024 // This routine is called by all master threads in teams construct 7025 kmp_info_t *thr = __kmp_threads[gtid]; 7026 kmp_team_t *team = thr->th.th_team; 7027 ident_t *loc = team->t.t_ident; 7028 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7029 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7030 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7031 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7032 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7033 7034 // This thread is a new CG root. Set up the proper variables. 
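// A contention-group (CG) root node records which thread heads the group, the
// thread-limit inherited from the ICVs, and a count of threads currently in the
// group. Illustrative shape of the node pushed below:
//   { cg_root = thr, cg_thread_limit = <inherited thread-limit ICV>,
//     cg_nthreads = 1, up = <previous head of thr->th.th_cg_roots> }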
7035 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7036 tmp->cg_root = thr; // Make thr the CG root 7037 // Init to thread limit that was stored when league masters were forked 7038 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7039 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7040 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7041 " cg_nthreads to 1\n", 7042 thr, tmp)); 7043 tmp->up = thr->th.th_cg_roots; 7044 thr->th.th_cg_roots = tmp; 7045 7046 // Launch league of teams now, but not let workers execute 7047 // (they hang on fork barrier until next parallel) 7048 #if INCLUDE_SSC_MARKS 7049 SSC_MARK_FORKING(); 7050 #endif 7051 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7052 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7053 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7054 #if INCLUDE_SSC_MARKS 7055 SSC_MARK_JOINING(); 7056 #endif 7057 // If the team size was reduced from the limit, set it to the new size 7058 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7059 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7060 // AC: last parameter "1" eliminates join barrier which won't work because 7061 // worker threads are in a fork barrier waiting for more parallel regions 7062 __kmp_join_call(loc, gtid 7063 #if OMPT_SUPPORT 7064 , 7065 fork_context_intel 7066 #endif 7067 , 7068 1); 7069 } 7070 7071 int __kmp_invoke_teams_master(int gtid) { 7072 kmp_info_t *this_thr = __kmp_threads[gtid]; 7073 kmp_team_t *team = this_thr->th.th_team; 7074 #if KMP_DEBUG 7075 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7076 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7077 (void *)__kmp_teams_master); 7078 #endif 7079 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7080 __kmp_teams_master(gtid); 7081 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7082 return 1; 7083 } 7084 7085 /* this sets the requested number of threads for the next parallel region 7086 encountered by this team. since this should be enclosed in the forkjoin 7087 critical section it should avoid race conditions with assymmetrical nested 7088 parallelism */ 7089 7090 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7091 kmp_info_t *thr = __kmp_threads[gtid]; 7092 7093 if (num_threads > 0) 7094 thr->th.th_set_nproc = num_threads; 7095 } 7096 7097 /* this sets the requested number of teams for the teams region and/or 7098 the number of threads for the next parallel region encountered */ 7099 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7100 int num_threads) { 7101 kmp_info_t *thr = __kmp_threads[gtid]; 7102 KMP_DEBUG_ASSERT(num_teams >= 0); 7103 KMP_DEBUG_ASSERT(num_threads >= 0); 7104 7105 if (num_teams == 0) 7106 num_teams = 1; // default number of teams is 1. 7107 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
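// More teams were requested than __kmp_teams_max_nth allows: warn once per
// process (guarded by __kmp_reserve_warn) and clamp the request to the maximum.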
7108 if (!__kmp_reserve_warn) { 7109 __kmp_reserve_warn = 1; 7110 __kmp_msg(kmp_ms_warning, 7111 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7112 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7113 } 7114 num_teams = __kmp_teams_max_nth; 7115 } 7116 // Set number of teams (number of threads in the outer "parallel" of the 7117 // teams) 7118 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7119 7120 // Remember the number of threads for inner parallel regions 7121 if (num_threads == 0) { 7122 if (!TCR_4(__kmp_init_middle)) 7123 __kmp_middle_initialize(); // get __kmp_avail_proc calculated 7124 num_threads = __kmp_avail_proc / num_teams; 7125 if (num_teams * num_threads > __kmp_teams_max_nth) { 7126 // adjust num_threads w/o warning as it is not user setting 7127 num_threads = __kmp_teams_max_nth / num_teams; 7128 } 7129 } else { 7130 // This thread will be the master of the league masters 7131 // Store new thread limit; old limit is saved in th_cg_roots list 7132 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7133 7134 if (num_teams * num_threads > __kmp_teams_max_nth) { 7135 int new_threads = __kmp_teams_max_nth / num_teams; 7136 if (!__kmp_reserve_warn) { // user asked for too many threads 7137 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7138 __kmp_msg(kmp_ms_warning, 7139 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7140 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7141 } 7142 num_threads = new_threads; 7143 } 7144 } 7145 thr->th.th_teams_size.nth = num_threads; 7146 } 7147 7148 // Set the proc_bind var to use in the following parallel region. 7149 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7150 kmp_info_t *thr = __kmp_threads[gtid]; 7151 thr->th.th_set_proc_bind = proc_bind; 7152 } 7153 7154 /* Launch the worker threads into the microtask. */ 7155 7156 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7157 kmp_info_t *this_thr = __kmp_threads[gtid]; 7158 7159 #ifdef KMP_DEBUG 7160 int f; 7161 #endif /* KMP_DEBUG */ 7162 7163 KMP_DEBUG_ASSERT(team); 7164 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7165 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7166 KMP_MB(); /* Flush all pending memory write invalidates. */ 7167 7168 team->t.t_construct = 0; /* no single directives seen yet */ 7169 team->t.t_ordered.dt.t_value = 7170 0; /* thread 0 enters the ordered section first */ 7171 7172 /* Reset the identifiers on the dispatch buffer */ 7173 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7174 if (team->t.t_max_nproc > 1) { 7175 int i; 7176 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7177 team->t.t_disp_buffer[i].buffer_index = i; 7178 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7179 } 7180 } else { 7181 team->t.t_disp_buffer[0].buffer_index = 0; 7182 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7183 } 7184 7185 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7186 KMP_ASSERT(this_thr->th.th_team == team); 7187 7188 #ifdef KMP_DEBUG 7189 for (f = 0; f < team->t.t_nproc; f++) { 7190 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7191 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7192 } 7193 #endif /* KMP_DEBUG */ 7194 7195 /* release the worker threads so they may begin working */ 7196 __kmp_fork_barrier(gtid, 0); 7197 } 7198 7199 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7200 kmp_info_t *this_thr = __kmp_threads[gtid]; 7201 7202 KMP_DEBUG_ASSERT(team); 7203 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7204 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7205 KMP_MB(); /* Flush all pending memory write invalidates. */ 7206 7207 /* Join barrier after fork */ 7208 7209 #ifdef KMP_DEBUG 7210 if (__kmp_threads[gtid] && 7211 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7212 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7213 __kmp_threads[gtid]); 7214 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7215 "team->t.t_nproc=%d\n", 7216 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7217 team->t.t_nproc); 7218 __kmp_print_structure(); 7219 } 7220 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7221 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7222 #endif /* KMP_DEBUG */ 7223 7224 __kmp_join_barrier(gtid); /* wait for everyone */ 7225 #if OMPT_SUPPORT 7226 if (ompt_enabled.enabled && 7227 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7228 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7229 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7230 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7231 #if OMPT_OPTIONAL 7232 void *codeptr = NULL; 7233 if (KMP_MASTER_TID(ds_tid) && 7234 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7235 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7236 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7237 7238 if (ompt_enabled.ompt_callback_sync_region_wait) { 7239 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7240 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7241 codeptr); 7242 } 7243 if (ompt_enabled.ompt_callback_sync_region) { 7244 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7245 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7246 codeptr); 7247 } 7248 #endif 7249 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7250 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7251 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7252 } 7253 } 7254 #endif 7255 7256 KMP_MB(); /* Flush all pending memory write invalidates. */ 7257 KMP_ASSERT(this_thr->th.th_team == team); 7258 } 7259 7260 /* ------------------------------------------------------------------------ */ 7261 7262 #ifdef USE_LOAD_BALANCE 7263 7264 // Return the worker threads actively spinning in the hot team, if we 7265 // are at the outermost level of parallelism. Otherwise, return 0. 7266 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7267 int i; 7268 int retval; 7269 kmp_team_t *hot_team; 7270 7271 if (root->r.r_active) { 7272 return 0; 7273 } 7274 hot_team = root->r.r_hot_team; 7275 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7276 return hot_team->t.t_nproc - 1; // Don't count master thread 7277 } 7278 7279 // Skip the master thread - it is accounted for elsewhere. 
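// Count only workers whose th_active flag is still set; with a finite
// blocktime, workers that have already dropped into a sleep state no longer
// contribute to the machine load and are therefore not counted here.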
7280 retval = 0; 7281 for (i = 1; i < hot_team->t.t_nproc; i++) { 7282 if (hot_team->t.t_threads[i]->th.th_active) { 7283 retval++; 7284 } 7285 } 7286 return retval; 7287 } 7288 7289 // Perform an automatic adjustment to the number of 7290 // threads used by the next parallel region. 7291 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7292 int retval; 7293 int pool_active; 7294 int hot_team_active; 7295 int team_curr_active; 7296 int system_active; 7297 7298 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7299 set_nproc)); 7300 KMP_DEBUG_ASSERT(root); 7301 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7302 ->th.th_current_task->td_icvs.dynamic == TRUE); 7303 KMP_DEBUG_ASSERT(set_nproc > 1); 7304 7305 if (set_nproc == 1) { 7306 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7307 return 1; 7308 } 7309 7310 // Threads that are active in the thread pool, active in the hot team for this 7311 // particular root (if we are at the outer par level), and the currently 7312 // executing thread (to become the master) are available to add to the new 7313 // team, but are currently contributing to the system load, and must be 7314 // accounted for. 7315 pool_active = __kmp_thread_pool_active_nth; 7316 hot_team_active = __kmp_active_hot_team_nproc(root); 7317 team_curr_active = pool_active + hot_team_active + 1; 7318 7319 // Check the system load. 7320 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7321 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7322 "hot team active = %d\n", 7323 system_active, pool_active, hot_team_active)); 7324 7325 if (system_active < 0) { 7326 // There was an error reading the necessary info from /proc, so use the 7327 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7328 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7329 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7330 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7331 7332 // Make this call behave like the thread limit algorithm. 7333 retval = __kmp_avail_proc - __kmp_nth + 7334 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7335 if (retval > set_nproc) { 7336 retval = set_nproc; 7337 } 7338 if (retval < KMP_MIN_NTH) { 7339 retval = KMP_MIN_NTH; 7340 } 7341 7342 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7343 retval)); 7344 return retval; 7345 } 7346 7347 // There is a slight delay in the load balance algorithm in detecting new 7348 // running procs. The real system load at this instant should be at least as 7349 // large as the #active omp thread that are available to add to the team. 7350 if (system_active < team_curr_active) { 7351 system_active = team_curr_active; 7352 } 7353 retval = __kmp_avail_proc - system_active + team_curr_active; 7354 if (retval > set_nproc) { 7355 retval = set_nproc; 7356 } 7357 if (retval < KMP_MIN_NTH) { 7358 retval = KMP_MIN_NTH; 7359 } 7360 7361 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. 
retval:%d\n", retval)); 7362 return retval; 7363 } // __kmp_load_balance_nproc() 7364 7365 #endif /* USE_LOAD_BALANCE */ 7366 7367 /* ------------------------------------------------------------------------ */ 7368 7369 /* NOTE: this is called with the __kmp_init_lock held */ 7370 void __kmp_cleanup(void) { 7371 int f; 7372 7373 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7374 7375 if (TCR_4(__kmp_init_parallel)) { 7376 #if KMP_HANDLE_SIGNALS 7377 __kmp_remove_signals(); 7378 #endif 7379 TCW_4(__kmp_init_parallel, FALSE); 7380 } 7381 7382 if (TCR_4(__kmp_init_middle)) { 7383 #if KMP_AFFINITY_SUPPORTED 7384 __kmp_affinity_uninitialize(); 7385 #endif /* KMP_AFFINITY_SUPPORTED */ 7386 __kmp_cleanup_hierarchy(); 7387 TCW_4(__kmp_init_middle, FALSE); 7388 } 7389 7390 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7391 7392 if (__kmp_init_serial) { 7393 __kmp_runtime_destroy(); 7394 __kmp_init_serial = FALSE; 7395 } 7396 7397 __kmp_cleanup_threadprivate_caches(); 7398 7399 for (f = 0; f < __kmp_threads_capacity; f++) { 7400 if (__kmp_root[f] != NULL) { 7401 __kmp_free(__kmp_root[f]); 7402 __kmp_root[f] = NULL; 7403 } 7404 } 7405 __kmp_free(__kmp_threads); 7406 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7407 // there is no need in freeing __kmp_root. 7408 __kmp_threads = NULL; 7409 __kmp_root = NULL; 7410 __kmp_threads_capacity = 0; 7411 7412 #if KMP_USE_DYNAMIC_LOCK 7413 __kmp_cleanup_indirect_user_locks(); 7414 #else 7415 __kmp_cleanup_user_locks(); 7416 #endif 7417 7418 #if KMP_AFFINITY_SUPPORTED 7419 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7420 __kmp_cpuinfo_file = NULL; 7421 #endif /* KMP_AFFINITY_SUPPORTED */ 7422 7423 #if KMP_USE_ADAPTIVE_LOCKS 7424 #if KMP_DEBUG_ADAPTIVE_LOCKS 7425 __kmp_print_speculative_stats(); 7426 #endif 7427 #endif 7428 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7429 __kmp_nested_nth.nth = NULL; 7430 __kmp_nested_nth.size = 0; 7431 __kmp_nested_nth.used = 0; 7432 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7433 __kmp_nested_proc_bind.bind_types = NULL; 7434 __kmp_nested_proc_bind.size = 0; 7435 __kmp_nested_proc_bind.used = 0; 7436 if (__kmp_affinity_format) { 7437 KMP_INTERNAL_FREE(__kmp_affinity_format); 7438 __kmp_affinity_format = NULL; 7439 } 7440 7441 __kmp_i18n_catclose(); 7442 7443 #if KMP_USE_HIER_SCHED 7444 __kmp_hier_scheds.deallocate(); 7445 #endif 7446 7447 #if KMP_STATS_ENABLED 7448 __kmp_stats_fini(); 7449 #endif 7450 7451 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7452 } 7453 7454 /* ------------------------------------------------------------------------ */ 7455 7456 int __kmp_ignore_mppbeg(void) { 7457 char *env; 7458 7459 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7460 if (__kmp_str_match_false(env)) 7461 return FALSE; 7462 } 7463 // By default __kmpc_begin() is no-op. 7464 return TRUE; 7465 } 7466 7467 int __kmp_ignore_mppend(void) { 7468 char *env; 7469 7470 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7471 if (__kmp_str_match_false(env)) 7472 return FALSE; 7473 } 7474 // By default __kmpc_end() is no-op. 
7475 return TRUE; 7476 } 7477 7478 void __kmp_internal_begin(void) { 7479 int gtid; 7480 kmp_root_t *root; 7481 7482 /* this is a very important step as it will register new sibling threads 7483 and assign these new uber threads a new gtid */ 7484 gtid = __kmp_entry_gtid(); 7485 root = __kmp_threads[gtid]->th.th_root; 7486 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7487 7488 if (root->r.r_begin) 7489 return; 7490 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7491 if (root->r.r_begin) { 7492 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7493 return; 7494 } 7495 7496 root->r.r_begin = TRUE; 7497 7498 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7499 } 7500 7501 /* ------------------------------------------------------------------------ */ 7502 7503 void __kmp_user_set_library(enum library_type arg) { 7504 int gtid; 7505 kmp_root_t *root; 7506 kmp_info_t *thread; 7507 7508 /* first, make sure we are initialized so we can get our gtid */ 7509 7510 gtid = __kmp_entry_gtid(); 7511 thread = __kmp_threads[gtid]; 7512 7513 root = thread->th.th_root; 7514 7515 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7516 library_serial)); 7517 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7518 thread */ 7519 KMP_WARNING(SetLibraryIncorrectCall); 7520 return; 7521 } 7522 7523 switch (arg) { 7524 case library_serial: 7525 thread->th.th_set_nproc = 0; 7526 set__nproc(thread, 1); 7527 break; 7528 case library_turnaround: 7529 thread->th.th_set_nproc = 0; 7530 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7531 : __kmp_dflt_team_nth_ub); 7532 break; 7533 case library_throughput: 7534 thread->th.th_set_nproc = 0; 7535 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7536 : __kmp_dflt_team_nth_ub); 7537 break; 7538 default: 7539 KMP_FATAL(UnknownLibraryType, arg); 7540 } 7541 7542 __kmp_aux_set_library(arg); 7543 } 7544 7545 void __kmp_aux_set_stacksize(size_t arg) { 7546 if (!__kmp_init_serial) 7547 __kmp_serial_initialize(); 7548 7549 #if KMP_OS_DARWIN 7550 if (arg & (0x1000 - 1)) { 7551 arg &= ~(0x1000 - 1); 7552 if (arg + 0x1000) /* check for overflow if we round up */ 7553 arg += 0x1000; 7554 } 7555 #endif 7556 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7557 7558 /* only change the default stacksize before the first parallel region */ 7559 if (!TCR_4(__kmp_init_parallel)) { 7560 size_t value = arg; /* argument is in bytes */ 7561 7562 if (value < __kmp_sys_min_stksize) 7563 value = __kmp_sys_min_stksize; 7564 else if (value > KMP_MAX_STKSIZE) 7565 value = KMP_MAX_STKSIZE; 7566 7567 __kmp_stksize = value; 7568 7569 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7570 } 7571 7572 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7573 } 7574 7575 /* set the behaviour of the runtime library */ 7576 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7577 void __kmp_aux_set_library(enum library_type arg) { 7578 __kmp_library = arg; 7579 7580 switch (__kmp_library) { 7581 case library_serial: { 7582 KMP_INFORM(LibraryIsSerial); 7583 } break; 7584 case library_turnaround: 7585 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 7586 __kmp_use_yield = 2; // only yield when oversubscribed 7587 break; 7588 case library_throughput: 7589 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 7590 __kmp_dflt_blocktime = 200; 7591 break; 7592 default: 7593 KMP_FATAL(UnknownLibraryType, arg); 7594 } 7595 } 7596 7597 /* Getting team information common for all team API */ 7598 // Returns NULL if not in teams construct 7599 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 7600 kmp_info_t *thr = __kmp_entry_thread(); 7601 teams_serialized = 0; 7602 if (thr->th.th_teams_microtask) { 7603 kmp_team_t *team = thr->th.th_team; 7604 int tlevel = thr->th.th_teams_level; // the level of the teams construct 7605 int ii = team->t.t_level; 7606 teams_serialized = team->t.t_serialized; 7607 int level = tlevel + 1; 7608 KMP_DEBUG_ASSERT(ii >= tlevel); 7609 while (ii > level) { 7610 for (teams_serialized = team->t.t_serialized; 7611 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 7612 } 7613 if (team->t.t_serialized && (!teams_serialized)) { 7614 team = team->t.t_parent; 7615 continue; 7616 } 7617 if (ii > level) { 7618 team = team->t.t_parent; 7619 ii--; 7620 } 7621 } 7622 return team; 7623 } 7624 return NULL; 7625 } 7626 7627 int __kmp_aux_get_team_num() { 7628 int serialized; 7629 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7630 if (team) { 7631 if (serialized > 1) { 7632 return 0; // teams region is serialized ( 1 team of 1 thread ). 7633 } else { 7634 return team->t.t_master_tid; 7635 } 7636 } 7637 return 0; 7638 } 7639 7640 int __kmp_aux_get_num_teams() { 7641 int serialized; 7642 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7643 if (team) { 7644 if (serialized > 1) { 7645 return 1; 7646 } else { 7647 return team->t.t_parent->t.t_nproc; 7648 } 7649 } 7650 return 1; 7651 } 7652 7653 /* ------------------------------------------------------------------------ */ 7654 7655 /* 7656 * Affinity Format Parser 7657 * 7658 * Field is in form of: %[[[0].]size]type 7659 * % and type are required (%% means print a literal '%') 7660 * type is either single char or long name surrounded by {}, 7661 * e.g., N or {num_threads} 7662 * 0 => leading zeros 7663 * . => right justified when size is specified 7664 * by default output is left justified 7665 * size is the *minimum* field length 7666 * All other characters are printed as is 7667 * 7668 * Available field types: 7669 * L {thread_level} - omp_get_level() 7670 * n {thread_num} - omp_get_thread_num() 7671 * h {host} - name of host machine 7672 * P {process_id} - process id (integer) 7673 * T {thread_identifier} - native thread identifier (integer) 7674 * N {num_threads} - omp_get_num_threads() 7675 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 7676 * a {thread_affinity} - comma separated list of integers or integer ranges 7677 * (values of affinity mask) 7678 * 7679 * Implementation-specific field types can be added 7680 * If a type is unknown, print "undefined" 7681 */ 7682 7683 // Structure holding the short name, long name, and corresponding data type 7684 // for snprintf. A table of these will represent the entire valid keyword 7685 // field types. 
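// For example (illustrative, given the default table below): the format string
//   "OMP: host=%H pid=%P thread=%n affinity=%A"
// expands to the host name, process id, team-local thread number and -- when
// affinity is supported -- the thread's affinity mask, while "%0.8N" prints
// omp_get_num_threads() zero-padded and right-justified in an 8-character field.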
7686 typedef struct kmp_affinity_format_field_t { 7687 char short_name; // from spec e.g., L -> thread level 7688 const char *long_name; // from spec thread_level -> thread level 7689 char field_format; // data type for snprintf (typically 'd' or 's' 7690 // for integer or string) 7691 } kmp_affinity_format_field_t; 7692 7693 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 7694 #if KMP_AFFINITY_SUPPORTED 7695 {'A', "thread_affinity", 's'}, 7696 #endif 7697 {'t', "team_num", 'd'}, 7698 {'T', "num_teams", 'd'}, 7699 {'L', "nesting_level", 'd'}, 7700 {'n', "thread_num", 'd'}, 7701 {'N', "num_threads", 'd'}, 7702 {'a', "ancestor_tnum", 'd'}, 7703 {'H', "host", 's'}, 7704 {'P', "process_id", 'd'}, 7705 {'i', "native_thread_id", 'd'}}; 7706 7707 // Return the number of characters it takes to hold field 7708 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 7709 const char **ptr, 7710 kmp_str_buf_t *field_buffer) { 7711 int rc, format_index, field_value; 7712 const char *width_left, *width_right; 7713 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 7714 static const int FORMAT_SIZE = 20; 7715 char format[FORMAT_SIZE] = {0}; 7716 char absolute_short_name = 0; 7717 7718 KMP_DEBUG_ASSERT(gtid >= 0); 7719 KMP_DEBUG_ASSERT(th); 7720 KMP_DEBUG_ASSERT(**ptr == '%'); 7721 KMP_DEBUG_ASSERT(field_buffer); 7722 7723 __kmp_str_buf_clear(field_buffer); 7724 7725 // Skip the initial % 7726 (*ptr)++; 7727 7728 // Check for %% first 7729 if (**ptr == '%') { 7730 __kmp_str_buf_cat(field_buffer, "%", 1); 7731 (*ptr)++; // skip over the second % 7732 return 1; 7733 } 7734 7735 // Parse field modifiers if they are present 7736 pad_zeros = false; 7737 if (**ptr == '0') { 7738 pad_zeros = true; 7739 (*ptr)++; // skip over 0 7740 } 7741 right_justify = false; 7742 if (**ptr == '.') { 7743 right_justify = true; 7744 (*ptr)++; // skip over . 7745 } 7746 // Parse width of field: [width_left, width_right) 7747 width_left = width_right = NULL; 7748 if (**ptr >= '0' && **ptr <= '9') { 7749 width_left = *ptr; 7750 SKIP_DIGITS(*ptr); 7751 width_right = *ptr; 7752 } 7753 7754 // Create the format for KMP_SNPRINTF based on flags parsed above 7755 format_index = 0; 7756 format[format_index++] = '%'; 7757 if (!right_justify) 7758 format[format_index++] = '-'; 7759 if (pad_zeros) 7760 format[format_index++] = '0'; 7761 if (width_left && width_right) { 7762 int i = 0; 7763 // Only allow 8 digit number widths. 
7764 // This also prevents overflowing format variable 7765 while (i < 8 && width_left < width_right) { 7766 format[format_index++] = *width_left; 7767 width_left++; 7768 i++; 7769 } 7770 } 7771 7772 // Parse a name (long or short) 7773 // Canonicalize the name into absolute_short_name 7774 found_valid_name = false; 7775 parse_long_name = (**ptr == '{'); 7776 if (parse_long_name) 7777 (*ptr)++; // skip initial left brace 7778 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 7779 sizeof(__kmp_affinity_format_table[0]); 7780 ++i) { 7781 char short_name = __kmp_affinity_format_table[i].short_name; 7782 const char *long_name = __kmp_affinity_format_table[i].long_name; 7783 char field_format = __kmp_affinity_format_table[i].field_format; 7784 if (parse_long_name) { 7785 int length = KMP_STRLEN(long_name); 7786 if (strncmp(*ptr, long_name, length) == 0) { 7787 found_valid_name = true; 7788 (*ptr) += length; // skip the long name 7789 } 7790 } else if (**ptr == short_name) { 7791 found_valid_name = true; 7792 (*ptr)++; // skip the short name 7793 } 7794 if (found_valid_name) { 7795 format[format_index++] = field_format; 7796 format[format_index++] = '\0'; 7797 absolute_short_name = short_name; 7798 break; 7799 } 7800 } 7801 if (parse_long_name) { 7802 if (**ptr != '}') { 7803 absolute_short_name = 0; 7804 } else { 7805 (*ptr)++; // skip over the right brace 7806 } 7807 } 7808 7809 // Attempt to fill the buffer with the requested 7810 // value using snprintf within __kmp_str_buf_print() 7811 switch (absolute_short_name) { 7812 case 't': 7813 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 7814 break; 7815 case 'T': 7816 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 7817 break; 7818 case 'L': 7819 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 7820 break; 7821 case 'n': 7822 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 7823 break; 7824 case 'H': { 7825 static const int BUFFER_SIZE = 256; 7826 char buf[BUFFER_SIZE]; 7827 __kmp_expand_host_name(buf, BUFFER_SIZE); 7828 rc = __kmp_str_buf_print(field_buffer, format, buf); 7829 } break; 7830 case 'P': 7831 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 7832 break; 7833 case 'i': 7834 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 7835 break; 7836 case 'N': 7837 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 7838 break; 7839 case 'a': 7840 field_value = 7841 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 7842 rc = __kmp_str_buf_print(field_buffer, format, field_value); 7843 break; 7844 #if KMP_AFFINITY_SUPPORTED 7845 case 'A': { 7846 kmp_str_buf_t buf; 7847 __kmp_str_buf_init(&buf); 7848 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 7849 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 7850 __kmp_str_buf_free(&buf); 7851 } break; 7852 #endif 7853 default: 7854 // According to spec, If an implementation does not have info for field 7855 // type, then "undefined" is printed 7856 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 7857 // Skip the field 7858 if (parse_long_name) { 7859 SKIP_TOKEN(*ptr); 7860 if (**ptr == '}') 7861 (*ptr)++; 7862 } else { 7863 (*ptr)++; 7864 } 7865 } 7866 7867 KMP_ASSERT(format_index <= FORMAT_SIZE); 7868 return rc; 7869 } 7870 7871 /* 7872 * Return number of characters needed to hold the affinity string 7873 * (not including null byte character) 7874 * The resultant string is printed to buffer, which 
the caller can then 7875 * handle afterwards 7876 */ 7877 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 7878 kmp_str_buf_t *buffer) { 7879 const char *parse_ptr; 7880 size_t retval; 7881 const kmp_info_t *th; 7882 kmp_str_buf_t field; 7883 7884 KMP_DEBUG_ASSERT(buffer); 7885 KMP_DEBUG_ASSERT(gtid >= 0); 7886 7887 __kmp_str_buf_init(&field); 7888 __kmp_str_buf_clear(buffer); 7889 7890 th = __kmp_threads[gtid]; 7891 retval = 0; 7892 7893 // If format is NULL or zero-length string, then we use 7894 // affinity-format-var ICV 7895 parse_ptr = format; 7896 if (parse_ptr == NULL || *parse_ptr == '\0') { 7897 parse_ptr = __kmp_affinity_format; 7898 } 7899 KMP_DEBUG_ASSERT(parse_ptr); 7900 7901 while (*parse_ptr != '\0') { 7902 // Parse a field 7903 if (*parse_ptr == '%') { 7904 // Put field in the buffer 7905 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 7906 __kmp_str_buf_catbuf(buffer, &field); 7907 retval += rc; 7908 } else { 7909 // Put literal character in buffer 7910 __kmp_str_buf_cat(buffer, parse_ptr, 1); 7911 retval++; 7912 parse_ptr++; 7913 } 7914 } 7915 __kmp_str_buf_free(&field); 7916 return retval; 7917 } 7918 7919 // Displays the affinity string to stdout 7920 void __kmp_aux_display_affinity(int gtid, const char *format) { 7921 kmp_str_buf_t buf; 7922 __kmp_str_buf_init(&buf); 7923 __kmp_aux_capture_affinity(gtid, format, &buf); 7924 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 7925 __kmp_str_buf_free(&buf); 7926 } 7927 7928 /* ------------------------------------------------------------------------ */ 7929 7930 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 7931 int blocktime = arg; /* argument is in milliseconds */ 7932 #if KMP_USE_MONITOR 7933 int bt_intervals; 7934 #endif 7935 int bt_set; 7936 7937 __kmp_save_internal_controls(thread); 7938 7939 /* Normalize and set blocktime for the teams */ 7940 if (blocktime < KMP_MIN_BLOCKTIME) 7941 blocktime = KMP_MIN_BLOCKTIME; 7942 else if (blocktime > KMP_MAX_BLOCKTIME) 7943 blocktime = KMP_MAX_BLOCKTIME; 7944 7945 set__blocktime_team(thread->th.th_team, tid, blocktime); 7946 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 7947 7948 #if KMP_USE_MONITOR 7949 /* Calculate and set blocktime intervals for the teams */ 7950 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 7951 7952 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 7953 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 7954 #endif 7955 7956 /* Set whether blocktime has been set to "TRUE" */ 7957 bt_set = TRUE; 7958 7959 set__bt_set_team(thread->th.th_team, tid, bt_set); 7960 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 7961 #if KMP_USE_MONITOR 7962 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 7963 "bt_intervals=%d, monitor_updates=%d\n", 7964 __kmp_gtid_from_tid(tid, thread->th.th_team), 7965 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 7966 __kmp_monitor_wakeups)); 7967 #else 7968 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 7969 __kmp_gtid_from_tid(tid, thread->th.th_team), 7970 thread->th.th_team->t.t_id, tid, blocktime)); 7971 #endif 7972 } 7973 7974 void __kmp_aux_set_defaults(char const *str, int len) { 7975 if (!__kmp_init_serial) { 7976 __kmp_serial_initialize(); 7977 } 7978 __kmp_env_initialize(str); 7979 7980 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 7981 __kmp_env_print(); 7982 } 7983 } // __kmp_aux_set_defaults 7984 7985 /* 
------------------------------------------------------------------------ */ 7986 /* internal fast reduction routines */ 7987 7988 PACKED_REDUCTION_METHOD_T 7989 __kmp_determine_reduction_method( 7990 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 7991 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 7992 kmp_critical_name *lck) { 7993 7994 // Default reduction method: critical construct ( lck != NULL, like in current 7995 // PAROPT ) 7996 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 7997 // can be selected by RTL 7998 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 7999 // can be selected by RTL 8000 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8001 // among generated by PAROPT. 8002 8003 PACKED_REDUCTION_METHOD_T retval; 8004 8005 int team_size; 8006 8007 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8008 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8009 8010 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8011 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) 8012 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8013 8014 retval = critical_reduce_block; 8015 8016 // another choice of getting a team size (with 1 dynamic deference) is slower 8017 team_size = __kmp_get_team_num_threads(global_tid); 8018 if (team_size == 1) { 8019 8020 retval = empty_reduce_block; 8021 8022 } else { 8023 8024 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8025 8026 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 8027 8028 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8029 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8030 8031 int teamsize_cutoff = 4; 8032 8033 #if KMP_MIC_SUPPORTED 8034 if (__kmp_mic_type != non_mic) { 8035 teamsize_cutoff = 8; 8036 } 8037 #endif 8038 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8039 if (tree_available) { 8040 if (team_size <= teamsize_cutoff) { 8041 if (atomic_available) { 8042 retval = atomic_reduce_block; 8043 } 8044 } else { 8045 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8046 } 8047 } else if (atomic_available) { 8048 retval = atomic_reduce_block; 8049 } 8050 #else 8051 #error "Unknown or unsupported OS" 8052 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8053 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8054 8055 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8056 8057 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8058 8059 // basic tuning 8060 8061 if (atomic_available) { 8062 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}

// This function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status back to kmp_not_paused and wakes up all
// threads.
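// A thread that is not yet asleep may be holding its suspend mutex while it
// decides whether to sleep; the resume path below therefore retries until it
// either observes the thread asleep (and resumes it) or briefly acquires the
// mutex, after which the thread is expected to notice the cleared pause status
// rather than going to sleep.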
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}
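/* Usage sketch (illustrative, not compiled here): __kmp_pause_resource() is
   reached from the OpenMP 5.0 pause API via __kmpc_pause_resource, as noted
   above. The example assumes the standard omp_pause_resource_all() entry
   point and the omp_pause_soft/omp_pause_hard kinds; do_non_openmp_work() is
   a hypothetical placeholder for the application's serial phase. Like the
   function above, the call returns 0 on success and nonzero otherwise.

     #include <omp.h>
     #include <stdio.h>

     extern void do_non_openmp_work(void); // hypothetical serial phase

     void quiesce_between_phases(void) {
       // Soft pause: worker threads stop spin-waiting for blocktime and go to
       // sleep immediately; the thread pool itself is kept.
       if (omp_pause_resource_all(omp_pause_soft) != 0)
         fprintf(stderr, "soft pause rejected (already paused?)\n");

       do_non_openmp_work();

       // No explicit resume call is required: the runtime resumes on its next
       // use (see __kmp_resume_if_soft_paused above).
     }
*/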