/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp_wait_release.h"
#include "kmp_barrier.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"
// for distributed barrier
#include "kmp_affinity.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------
// Distributed barrier

// Compute how many threads to have polling each cache-line.
// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
void distributedBarrier::computeVarsForN(size_t n) {
  int nsockets = 1;
  if (__kmp_topology) {
    int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
    int core_level = __kmp_topology->get_level(KMP_HW_CORE);
    int ncores_per_socket =
        __kmp_topology->calculate_ratio(core_level, socket_level);
    nsockets = __kmp_topology->get_count(socket_level);

    if (nsockets <= 0)
      nsockets = 1;
    if (ncores_per_socket <= 0)
      ncores_per_socket = 1;

    threads_per_go = ncores_per_socket >> 1;
    if (!fix_threads_per_go) {
      // Minimize num_gos
      if (threads_per_go > 4) {
        if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
          threads_per_go = threads_per_go >> 1;
        }
        if (threads_per_go > 4 && nsockets == 1)
          threads_per_go = threads_per_go >> 1;
      }
    }
    if (threads_per_go == 0)
      threads_per_go = 1;
    fix_threads_per_go = true;
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    if (nsockets == 1 || num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / nsockets;
      if (num_gos % nsockets)
        num_groups++;
    }
    if (num_groups <= 0)
      num_groups = 1;
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  } else {
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    if (num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / 2;
      if (num_gos % 2)
        num_groups++;
    }
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  }
}
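// Illustrative walk-through of the sizing math above (example values only,
// assuming the default KMP_OPTIMIZE_FOR_REDUCTIONS == 0): for n = 64 threads
// on a topology with 2 sockets and 16 cores per socket, threads_per_go is
// 16 >> 1 = 8 (not halved again since nsockets != 1), so num_gos = 64/8 = 8,
// num_groups = 8/2 = 4, gos_per_group = 8/4 = 2, and threads_per_group =
// 8 * 2 = 16, i.e. two polling groups per socket with two go flags each.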
void distributedBarrier::computeGo(size_t n) {
  // Minimize num_gos
  for (num_gos = 1;; num_gos++)
    if (IDEAL_CONTENTION * num_gos >= n)
      break;
  threads_per_go = n / num_gos;
  if (n % num_gos)
    threads_per_go++;
  while (num_gos > MAX_GOS) {
    threads_per_go++;
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
  }
  computeVarsForN(n);
}

// This function resizes the barrier arrays when the new number of threads
// exceeds max_threads, which is the current size of all the arrays
void distributedBarrier::resize(size_t nthr) {
  KMP_DEBUG_ASSERT(nthr > max_threads);

  // expand to requested size * 2
  max_threads = nthr * 2;

  // allocate arrays to new max threads
  for (int i = 0; i < MAX_ITERS; ++i) {
    if (flags[i])
      flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
                                                 max_threads * sizeof(flags_s));
    else
      flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
  }

  if (go)
    go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
  else
    go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));

  if (iter)
    iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
  else
    iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));

  if (sleep)
    sleep =
        (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
  else
    sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
}

// This function sets all the go flags that threads might be waiting on;
// when blocktime is not infinite, it should be followed by a wake-up call
// to each thread
kmp_uint64 distributedBarrier::go_release() {
  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
  for (size_t j = 0; j < num_gos; j++) {
    go[j].go.store(next_go);
  }
  return next_go;
}

void distributedBarrier::go_reset() {
  for (size_t j = 0; j < max_threads; ++j) {
    for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
      flags[i][j].stillNeed = 1;
    }
    go[j].go.store(0);
    iter[j].iter = 0;
  }
}

// This function inits/re-inits the distributed barrier for a particular number
// of threads. If a resize of arrays is needed, it calls the resize function.
void distributedBarrier::init(size_t nthr) {
  size_t old_max = max_threads;
  if (nthr > max_threads) { // need more space in arrays
    resize(nthr);
  }

  for (size_t i = 0; i < max_threads; i++) {
    for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
      flags[j][i].stillNeed = 1;
    }
    go[i].go.store(0);
    iter[i].iter = 0;
    if (i >= old_max)
      sleep[i].sleep = false;
  }

  // Recalculate num_gos, etc. based on new nthr
  computeVarsForN(nthr);

  num_threads = nthr;

  if (team_icvs == NULL)
    team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
}
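// Note on the double-buffered flags initialized above: MAX_ITERS copies of the
// per-thread stillNeed flags are kept so that consecutive barrier episodes do
// not reuse a flag before every poller has seen it; a thread arms
// flags[(iter + 1) % MAX_ITERS][tid] before clearing flags[iter][tid], and the
// go[] values are compared against iter + MAX_ITERS so a release value from
// one episode is not mistaken for the next. (Descriptive summary of the code
// in this file, not additional behavior.)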
// This function is used only when KMP_BLOCKTIME is not infinite.
// static
void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
                               size_t start, size_t stop, size_t inc,
                               size_t tid) {
  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
    return;

  kmp_info_t **other_threads = team->t.t_threads;
  for (size_t thr = start; thr < stop; thr += inc) {
    KMP_DEBUG_ASSERT(other_threads[thr]);
    int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
    // Wake up the worker regardless of whether it appears to be sleeping
    __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
  }
}

static void __kmp_dist_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_info_t **other_threads;
  kmp_uint64 my_current_iter, my_next_iter;
  kmp_uint32 nproc;
  bool group_leader;

  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;
  b = team->t.b;
  my_current_iter = b->iter[tid].iter;
  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
  group_leader = ((tid % b->threads_per_group) == 0);

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
            gtid, team->t.t_id, tid, bt));

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif

  if (group_leader) {
    // Start from the thread after the group leader
    size_t group_start = tid + 1;
    size_t group_end = tid + b->threads_per_group;
    size_t threads_pending = 0;

    if (group_end > nproc)
      group_end = nproc;
    do { // wait for threads in my group
      threads_pending = 0;
      // Check all the flags every time to avoid branch mispredicts
      for (size_t thr = group_start; thr < group_end; thr++) {
        // Each thread uses a different cache line
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      OMPT_REDUCTION_DECL(this_thr, gtid);
      OMPT_REDUCTION_BEGIN;
      // Group leader reduces all threads in group
      for (size_t thr = group_start; thr < group_end; thr++) {
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[thr]->th.th_local.reduce_data);
      }
      OMPT_REDUCTION_END;
    }

    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;

    do { // wait for all group leaders
      threads_pending = 0;
      for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        for (size_t thr = b->threads_per_group; thr < nproc;
             thr += b->threads_per_group) {
          (*reduce)(this_thr->th.th_local.reduce_data,
                    other_threads[thr]->th.th_local.reduce_data);
        }
        OMPT_REDUCTION_END;
      }
    }
  } else {
    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;
  }

  KMP_MFENCE();

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}

static void __kmp_dist_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_bstate_t *thr_bar;
  kmp_uint64 my_current_iter, next_go;
  size_t my_go_index;
  bool group_leader;

  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
                gtid, tid, bt));

  thr_bar = &this_thr->th.th_bar[bt].bb;

  if (!KMP_MASTER_TID(tid)) {
    // workers and non-master group leaders need to check their presence in
    // team
    do {
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) {
        // Thread is not in use in a team. Wait on location in tid's thread
        // struct. The 0 value tells anyone looking that this thread is
        // spinning or sleeping until this location becomes 3 again; 3 is the
        // transition state to get to 1 which is waiting on go and being in the
        // team
        kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
        if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
                                        0) ||
            this_thr->th.th_used_in_team.load() == 0) {
          my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
          // In fork barrier where we could not get the object reliably
          itt_sync_obj =
              __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
          // Cancel wait on previous parallel region...
          __kmp_itt_task_starting(itt_sync_obj);

          if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
          if (itt_sync_obj != NULL)
            // Call prepare as early as possible for "new" barrier
            __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
          return;
      }
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
        continue;
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      // At this point, the thread thinks it is in use in a team, or in
      // transition to be used in a team, but it might have reached this
      // barrier before it was marked unused by the team. Unused threads are
      // awoken and shifted to wait on local thread struct elsewhere. It also
      // might reach this point by being picked up for use by a different team.
      // Either way, we need to update the tid.
      tid = __kmp_tid_from_gtid(gtid);
      team = this_thr->th.th_team;
      KMP_DEBUG_ASSERT(tid >= 0);
      KMP_DEBUG_ASSERT(team);
      b = team->t.b;
      my_current_iter = b->iter[tid].iter;
      next_go = my_current_iter + distributedBarrier::MAX_ITERS;
      my_go_index = tid / b->threads_per_go;
      if (this_thr->th.th_used_in_team.load() == 3) {
        KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
      }
      // Check if go flag is set
      if (b->go[my_go_index].go.load() != next_go) {
        // Wait on go flag on team
        kmp_atomic_flag_64<false, true> my_flag(
            &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
        my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
                         b->iter[tid].iter == 0);
        KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
      }

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;
      // At this point, the thread's go location was set. This means the
      // primary thread is safely in the barrier, and so this thread's data is
      // up-to-date, but we should check again that this thread is really in
      // use in the team, as it could have been woken up for the purpose of
      // changing team size, or reaping threads at shutdown.
      if (this_thr->th.th_used_in_team.load() == 1)
        break;
    } while (1);

    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    group_leader = ((tid % b->threads_per_group) == 0);
    if (group_leader) {
      // Tell all the threads in my group they can go!
      for (size_t go_idx = my_go_index + 1;
           go_idx < my_go_index + b->gos_per_group; go_idx++) {
        b->go[go_idx].go.store(next_go);
      }
      // Fence added so that workers can see changes to go. sfence inadequate.
      KMP_MFENCE();
    }

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // copy ICVs to final dest
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                (kmp_internal_control_t *)team->t.b->team_icvs);
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
      // This thread is now awake and participating in the barrier;
      // wake up the other threads in the group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  } else { // Primary thread
    team = this_thr->th.th_team;
    b = team->t.b;
    my_current_iter = b->iter[tid].iter;
    next_go = my_current_iter + distributedBarrier::MAX_ITERS;
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
      // primary thread has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    // Tell all the group leaders they can go!
    for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
      b->go[go_idx].go.store(next_go);
    }

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake up the group leaders
      size_t nproc = this_thr->th.th_team_nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
                                b->threads_per_group, tid);
    }

    // Tell all the threads in my group they can go!
    for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
      b->go[go_idx].go.store(next_go);
    }

    // Fence added so that workers can see changes to go. sfence inadequate.
    KMP_MFENCE();

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake up the other threads in my group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  }
  // Update to next iteration
  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;

  KA_TRACE(
      20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
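// Informal summary of the th_used_in_team values checked in the release path
// above, as inferred from this function (the values are also manipulated in
// kmp_runtime.cpp): 1 = thread is in the team and waits on its go flag,
// 3 = transition state the team sets while (re)claiming the thread,
// 2 = value the thread CASes to 0 before parking on its own thread struct,
// 0 = parked until the team moves the location back to 3.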
// Linear Barrier
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to primary thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the primary thread
       at any time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
    // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

// Tree barrier
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state = 0;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the primary thread
       at any time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the primary thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}

static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
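// Indexing sketch for the tree gather/release above (illustrative values):
// with branch_bits = 2 (branch_factor = 4) and nproc = 10, a thread's children
// are tids (tid << 2) + 1 .. (tid << 2) + 4, clipped to nproc, and a worker's
// parent is (tid - 1) >> 2. So tid 0 gathers 1..4, tid 1 gathers 5..8, tid 2
// gathers 9, and tids 3..9 are leaves that only signal their parents.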
// Hyper Barrier
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing
         for loop), a worker thread may not assume that the team is valid any
         more - it could be deallocated by the primary thread at any time. */
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the primary thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
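// Hypercube pairing sketch for the gather above (illustrative: branch_bits = 1,
// so branch_factor = 2, with num_threads = 8): at level 0 the odd tids signal
// their even neighbors (1->0, 3->2, 5->4, 7->6) and leave the loop; at level 1
// tids 2 and 6 signal 0 and 4; at level 2 tid 4 signals 0. A thread exits as
// soon as ((tid >> level) & (branch_factor - 1)) != 0, reporting to
// parent_tid = tid & ~((1 << (level + branch_bits)) - 1).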
// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // primary thread
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // primary already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
        // Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their
   local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
    thr_bar->parent_tid = -1; // default for primary thread
    if (!KMP_MASTER_TID(tid)) {
      // if not primary thread, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the primary
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // TODO: can we make the above op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}
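// Layout note for the fields initialized above (descriptive only): leaf_state
// reserves one byte per leaf child inside this thread's 64-bit b_arrived word,
// filled from the high end (((char *)&leaf_state)[7 - i] = 1), and offset
// records which byte of the parent's flag word this thread owns,
// 7 - (tid - parent_tid) / skip_per_level[my_level]. For example, assuming
// skip_per_level[0] == 1 at the leaf level, tid 6 under parent 4 gets
// offset 7 - (6 - 4) / 1 = 5.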
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = 0;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
        }
        // clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not primary thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the primary thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      kmp_flag_64<> flag(&thr_bar->b_arrived,
                         other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
                           thr_bar->offset + 1);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Primary thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
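// Worked reading of the leaf check-in above (descriptive; the byte-level flag
// mechanics live in kmp_wait_release.h): a leaf stores the team's arrived
// value plus KMP_BARRIER_STATE_BUMP into its own b_arrived and then signals
// the byte selected by offset + 1 in its parent's b_arrived via
// kmp_flag_oncore, while the parent waits for b_arrived | leaf_state (all leaf
// bytes nonzero) and afterwards clears those bytes with
// KMP_TEST_THEN_AND64(&b_arrived, ~leaf_state).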
1521 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " 1522 "barrier type %d\n", 1523 gtid, team->t.t_id, tid, bt)); 1524 } 1525 1526 static void __kmp_hierarchical_barrier_release( 1527 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 1528 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 1529 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); 1530 kmp_team_t *team; 1531 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 1532 kmp_uint32 nproc; 1533 bool team_change = false; // indicates on-core barrier shouldn't be used 1534 1535 if (KMP_MASTER_TID(tid)) { 1536 team = __kmp_threads[gtid]->th.th_team; 1537 KMP_DEBUG_ASSERT(team != NULL); 1538 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " 1539 "entered barrier type %d\n", 1540 gtid, team->t.t_id, tid, bt)); 1541 } else { // Worker threads 1542 // Wait for parent thread to release me 1543 if (!thr_bar->use_oncore_barrier || 1544 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || 1545 thr_bar->team == NULL) { 1546 // Use traditional method of waiting on my own b_go flag 1547 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; 1548 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 1549 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1550 TCW_8(thr_bar->b_go, 1551 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1552 } else { // Thread barrier data is initialized, this is a leaf, blocktime is 1553 // infinite, not nested 1554 // Wait on my "offset" bits on parent's b_go flag 1555 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; 1556 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, 1557 thr_bar->offset + 1, bt, 1558 this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); 1559 flag.wait(this_thr, TRUE); 1560 if (thr_bar->wait_flag == 1561 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go 1562 TCW_8(thr_bar->b_go, 1563 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1564 } else { // Reset my bits on parent's b_go flag 1565 (RCAST(volatile char *, 1566 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0; 1567 } 1568 } 1569 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 1570 // Early exit for reaping threads releasing forkjoin barrier 1571 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 1572 return; 1573 // The worker thread may now assume that the team is valid. 1574 team = __kmp_threads[gtid]->th.th_team; 1575 KMP_DEBUG_ASSERT(team != NULL); 1576 tid = __kmp_tid_from_gtid(gtid); 1577 1578 KA_TRACE( 1579 20, 1580 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 1581 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 1582 KMP_MB(); // Flush all pending memory write invalidates. 1583 } 1584 1585 nproc = this_thr->th.th_team_nproc; 1586 int level = team->t.t_level; 1587 if (team->t.t_threads[0] 1588 ->th.th_teams_microtask) { // are we inside the teams construct? 1589 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 1590 this_thr->th.th_teams_level == level) 1591 ++level; // level was not increased in teams construct for team_of_workers 1592 if (this_thr->th.th_teams_size.nteams > 1) 1593 ++level; // level was not increased in teams construct for team_of_masters 1594 } 1595 if (level == 1) 1596 thr_bar->use_oncore_barrier = 1; 1597 else 1598 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 1599 1600 // If the team size has increased, we still communicate with old leaves via 1601 // oncore barrier. 
1602 unsigned short int old_leaf_kids = thr_bar->leaf_kids; 1603 kmp_uint64 old_leaf_state = thr_bar->leaf_state; 1604 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, 1605 tid, team); 1606 // But if the entire team changes, we won't use oncore barrier at all 1607 if (team_change) 1608 old_leaf_kids = 0; 1609 1610 #if KMP_BARRIER_ICV_PUSH 1611 if (propagate_icvs) { 1612 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, 1613 FALSE); 1614 if (KMP_MASTER_TID( 1615 tid)) { // primary already has copy in final destination; copy 1616 copy_icvs(&thr_bar->th_fixed_icvs, 1617 &team->t.t_implicit_task_taskdata[tid].td_icvs); 1618 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1619 thr_bar->use_oncore_barrier) { // optimization for inf blocktime 1620 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) 1621 // leaves (on-core children) pull parent's fixed ICVs directly to local 1622 // ICV store 1623 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1624 &thr_bar->parent_bar->th_fixed_icvs); 1625 // non-leaves will get ICVs piggybacked with b_go via NGO store 1626 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs 1627 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can 1628 // access 1629 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); 1630 else // leaves copy parent's fixed ICVs directly to local ICV store 1631 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1632 &thr_bar->parent_bar->th_fixed_icvs); 1633 } 1634 } 1635 #endif // KMP_BARRIER_ICV_PUSH 1636 1637 // Now, release my children 1638 if (thr_bar->my_level) { // not a leaf 1639 kmp_int32 child_tid; 1640 kmp_uint32 last; 1641 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1642 thr_bar->use_oncore_barrier) { 1643 if (KMP_MASTER_TID(tid)) { // do a flat release 1644 // Set local b_go to bump children via NGO store of the cache line 1645 // containing IVCs and b_go. 1646 thr_bar->b_go = KMP_BARRIER_STATE_BUMP; 1647 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of 1648 // the cache line 1649 ngo_load(&thr_bar->th_fixed_icvs); 1650 // This loops over all the threads skipping only the leaf nodes in the 1651 // hierarchy 1652 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; 1653 child_tid += thr_bar->skip_per_level[1]) { 1654 kmp_bstate_t *child_bar = 1655 &team->t.t_threads[child_tid]->th.th_bar[bt].bb; 1656 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1657 "releasing T#%d(%d:%d)" 1658 " go(%p): %u => %u\n", 1659 gtid, team->t.t_id, tid, 1660 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1661 child_tid, &child_bar->b_go, child_bar->b_go, 1662 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1663 // Use ngo store (if available) to both store ICVs and release child 1664 // via child's b_go 1665 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); 1666 } 1667 ngo_sync(); 1668 } 1669 TCW_8(thr_bar->b_go, 1670 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1671 // Now, release leaf children 1672 if (thr_bar->leaf_kids) { // if there are any 1673 // We test team_change on the off-chance that the level 1 team changed. 
1674 if (team_change || 1675 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new 1676 if (old_leaf_kids) { // release old leaf kids 1677 thr_bar->b_go |= old_leaf_state; 1678 } 1679 // Release new leaf kids 1680 last = tid + thr_bar->skip_per_level[1]; 1681 if (last > nproc) 1682 last = nproc; 1683 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; 1684 ++child_tid) { // skip_per_level[0]=1 1685 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1686 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1687 KA_TRACE( 1688 20, 1689 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" 1690 " T#%d(%d:%d) go(%p): %u => %u\n", 1691 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 1692 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 1693 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1694 // Release child using child's b_go flag 1695 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1696 flag.release(); 1697 } 1698 } else { // Release all children at once with leaf_state bits on my own 1699 // b_go flag 1700 thr_bar->b_go |= thr_bar->leaf_state; 1701 } 1702 } 1703 } else { // Blocktime is not infinite; do a simple hierarchical release 1704 for (int d = thr_bar->my_level - 1; d >= 0; 1705 --d) { // Release highest level threads first 1706 last = tid + thr_bar->skip_per_level[d + 1]; 1707 kmp_uint32 skip = thr_bar->skip_per_level[d]; 1708 if (last > nproc) 1709 last = nproc; 1710 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 1711 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1712 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1713 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1714 "releasing T#%d(%d:%d) go(%p): %u => %u\n", 1715 gtid, team->t.t_id, tid, 1716 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1717 child_tid, &child_bar->b_go, child_bar->b_go, 1718 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1719 // Release child using child's b_go flag 1720 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1721 flag.release(); 1722 } 1723 } 1724 } 1725 #if KMP_BARRIER_ICV_PUSH 1726 if (propagate_icvs && !KMP_MASTER_TID(tid)) 1727 // non-leaves copy ICVs from fixed ICVs to local dest 1728 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1729 &thr_bar->th_fixed_icvs); 1730 #endif // KMP_BARRIER_ICV_PUSH 1731 } 1732 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for " 1733 "barrier type %d\n", 1734 gtid, team->t.t_id, tid, bt)); 1735 } 1736 1737 // End of Barrier Algorithms 1738 1739 // type traits for cancellable value 1740 // if cancellable is true, then is_cancellable is a normal boolean variable 1741 // if cancellable is false, then is_cancellable is a compile time constant 1742 template <bool cancellable> struct is_cancellable {}; 1743 template <> struct is_cancellable<true> { 1744 bool value; 1745 is_cancellable() : value(false) {} 1746 is_cancellable(bool b) : value(b) {} 1747 is_cancellable &operator=(bool b) { 1748 value = b; 1749 return *this; 1750 } 1751 operator bool() const { return value; } 1752 }; 1753 template <> struct is_cancellable<false> { 1754 is_cancellable &operator=(bool b) { return *this; } 1755 constexpr operator bool() const { return false; } 1756 }; 1757 1758 // Internal function to do a barrier. 
1759 /* If is_split is true, do a split barrier, otherwise, do a plain barrier 1760 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split 1761 barrier 1762 When cancellable = false, 1763 Returns 0 if primary thread, 1 if worker thread. 1764 When cancellable = true 1765 Returns 0 if not cancelled, 1 if cancelled. */ 1766 template <bool cancellable = false> 1767 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, 1768 size_t reduce_size, void *reduce_data, 1769 void (*reduce)(void *, void *)) { 1770 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); 1771 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 1772 int tid = __kmp_tid_from_gtid(gtid); 1773 kmp_info_t *this_thr = __kmp_threads[gtid]; 1774 kmp_team_t *team = this_thr->th.th_team; 1775 int status = 0; 1776 is_cancellable<cancellable> cancelled; 1777 #if OMPT_SUPPORT && OMPT_OPTIONAL 1778 ompt_data_t *my_task_data; 1779 ompt_data_t *my_parallel_data; 1780 void *return_address; 1781 ompt_sync_region_t barrier_kind; 1782 #endif 1783 1784 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid, 1785 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1786 1787 #if OMPT_SUPPORT 1788 if (ompt_enabled.enabled) { 1789 #if OMPT_OPTIONAL 1790 my_task_data = OMPT_CUR_TASK_DATA(this_thr); 1791 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 1792 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1793 barrier_kind = __ompt_get_barrier_kind(bt, this_thr); 1794 if (ompt_enabled.ompt_callback_sync_region) { 1795 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1796 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1797 return_address); 1798 } 1799 if (ompt_enabled.ompt_callback_sync_region_wait) { 1800 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1801 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1802 return_address); 1803 } 1804 #endif 1805 // It is OK to report the barrier state after the barrier begin callback. 1806 // According to the OMPT specification, a compliant implementation may 1807 // even delay reporting this state until the barrier begins to wait. 1808 auto *ompt_thr_info = &this_thr->th.ompt_thread_info; 1809 switch (barrier_kind) { 1810 case ompt_sync_region_barrier_explicit: 1811 ompt_thr_info->state = ompt_state_wait_barrier_explicit; 1812 break; 1813 case ompt_sync_region_barrier_implicit_workshare: 1814 ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare; 1815 break; 1816 case ompt_sync_region_barrier_implicit_parallel: 1817 ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel; 1818 break; 1819 case ompt_sync_region_barrier_teams: 1820 ompt_thr_info->state = ompt_state_wait_barrier_teams; 1821 break; 1822 case ompt_sync_region_barrier_implementation: 1823 [[fallthrough]]; 1824 default: 1825 ompt_thr_info->state = ompt_state_wait_barrier_implementation; 1826 } 1827 } 1828 #endif 1829 1830 if (!team->t.t_serialized) { 1831 #if USE_ITT_BUILD 1832 // This value will be used in itt notify events below. 
1833 void *itt_sync_obj = NULL; 1834 #if USE_ITT_NOTIFY 1835 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1836 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 1837 #endif 1838 #endif /* USE_ITT_BUILD */ 1839 if (__kmp_tasking_mode == tskm_extra_barrier) { 1840 __kmp_tasking_barrier(team, this_thr, gtid); 1841 KA_TRACE(15, 1842 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, 1843 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1844 } 1845 1846 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 1847 access it when the team struct is not guaranteed to exist. */ 1848 // See note about the corresponding code in __kmp_join_barrier() being 1849 // performance-critical. 1850 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1851 #if KMP_USE_MONITOR 1852 this_thr->th.th_team_bt_intervals = 1853 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1854 this_thr->th.th_team_bt_set = 1855 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1856 #else 1857 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 1858 #endif 1859 } 1860 1861 #if USE_ITT_BUILD 1862 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1863 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 1864 #endif /* USE_ITT_BUILD */ 1865 #if USE_DEBUGGER 1866 // Let the debugger know: the thread arrived to the barrier and waiting. 1867 if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct 1868 team->t.t_bar[bt].b_master_arrived += 1; 1869 } else { 1870 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; 1871 } // if 1872 #endif /* USE_DEBUGGER */ 1873 if (reduce != NULL) { 1874 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 1875 this_thr->th.th_local.reduce_data = reduce_data; 1876 } 1877 1878 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) 1879 __kmp_task_team_setup(this_thr, team); 1880 1881 if (cancellable) { 1882 cancelled = __kmp_linear_barrier_gather_cancellable( 1883 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1884 } else { 1885 switch (__kmp_barrier_gather_pattern[bt]) { 1886 case bp_dist_bar: { 1887 __kmp_dist_barrier_gather(bt, this_thr, gtid, tid, 1888 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1889 break; 1890 } 1891 case bp_hyper_bar: { 1892 // don't set branch bits to 0; use linear 1893 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1894 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, 1895 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1896 break; 1897 } 1898 case bp_hierarchical_bar: { 1899 __kmp_hierarchical_barrier_gather( 1900 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1901 break; 1902 } 1903 case bp_tree_bar: { 1904 // don't set branch bits to 0; use linear 1905 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1906 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, 1907 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1908 break; 1909 } 1910 default: { 1911 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, 1912 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1913 } 1914 } 1915 } 1916 1917 KMP_MB(); 1918 1919 if (KMP_MASTER_TID(tid)) { 1920 status = 0; 1921 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 1922 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 1923 } 1924 #if USE_DEBUGGER 1925 // Let the debugger know: All threads are arrived and starting leaving the 1926 // barrier. 
1927 team->t.t_bar[bt].b_team_arrived += 1; 1928 #endif 1929 1930 if (__kmp_omp_cancellation) { 1931 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request); 1932 // Reset cancellation flag for worksharing constructs 1933 if (cancel_request == cancel_loop || 1934 cancel_request == cancel_sections) { 1935 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq); 1936 } 1937 } 1938 #if USE_ITT_BUILD 1939 /* TODO: In case of split reduction barrier, primary thread may send 1940 acquired event early, before the final summation into the shared 1941 variable is done (final summation can be a long operation for array 1942 reductions). */ 1943 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1944 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1945 #endif /* USE_ITT_BUILD */ 1946 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1947 // Barrier - report frame end (only if active_level == 1) 1948 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 1949 __kmp_forkjoin_frames_mode && 1950 (this_thr->th.th_teams_microtask == NULL || // either not in teams 1951 this_thr->th.th_teams_size.nteams == 1) && // or inside single team 1952 team->t.t_active_level == 1) { 1953 ident_t *loc = __kmp_threads[gtid]->th.th_ident; 1954 kmp_uint64 cur_time = __itt_get_timestamp(); 1955 kmp_info_t **other_threads = team->t.t_threads; 1956 int nproc = this_thr->th.th_team_nproc; 1957 int i; 1958 switch (__kmp_forkjoin_frames_mode) { 1959 case 1: 1960 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1961 loc, nproc); 1962 this_thr->th.th_frame_time = cur_time; 1963 break; 1964 case 2: // AC 2015-01-19: currently does not work for hierarchical (to 1965 // be fixed) 1966 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1967 1, loc, nproc); 1968 break; 1969 case 3: 1970 if (__itt_metadata_add_ptr) { 1971 // Initialize with primary thread's wait time 1972 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; 1973 // Set arrive time to zero to be able to check it in 1974 // __kmp_invoke_task(); the same is done inside the loop below 1975 this_thr->th.th_bar_arrive_time = 0; 1976 for (i = 1; i < nproc; ++i) { 1977 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 1978 other_threads[i]->th.th_bar_arrive_time = 0; 1979 } 1980 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 1981 cur_time, delta, 1982 (kmp_uint64)(reduce != NULL)); 1983 } 1984 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1985 loc, nproc); 1986 this_thr->th.th_frame_time = cur_time; 1987 break; 1988 } 1989 } 1990 #endif /* USE_ITT_BUILD */ 1991 } else { 1992 status = 1; 1993 #if USE_ITT_BUILD 1994 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1995 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1996 #endif /* USE_ITT_BUILD */ 1997 } 1998 if ((status == 1 || !is_split) && !cancelled) { 1999 if (cancellable) { 2000 cancelled = __kmp_linear_barrier_release_cancellable( 2001 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2002 } else { 2003 switch (__kmp_barrier_release_pattern[bt]) { 2004 case bp_dist_bar: { 2005 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2006 __kmp_dist_barrier_release(bt, this_thr, gtid, tid, 2007 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2008 break; 2009 } 2010 case bp_hyper_bar: { 2011 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2012 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 2013 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2014 break; 2015 } 2016 case bp_hierarchical_bar: { 2017 __kmp_hierarchical_barrier_release( 
2018 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2019 break; 2020 } 2021 case bp_tree_bar: { 2022 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2023 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 2024 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2025 break; 2026 } 2027 default: { 2028 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 2029 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2030 } 2031 } 2032 } 2033 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 2034 __kmp_task_team_sync(this_thr, team); 2035 } 2036 } 2037 2038 #if USE_ITT_BUILD 2039 /* GEH: TODO: Move this under if-condition above and also include in 2040 __kmp_end_split_barrier(). This will more accurately represent the actual 2041 release time of the threads for split barriers. */ 2042 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2043 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2044 #endif /* USE_ITT_BUILD */ 2045 } else { // Team is serialized. 2046 status = 0; 2047 if (__kmp_tasking_mode != tskm_immediate_exec) { 2048 if (this_thr->th.th_task_team != NULL) { 2049 #if USE_ITT_NOTIFY 2050 void *itt_sync_obj = NULL; 2051 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2052 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 2053 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 2054 } 2055 #endif 2056 2057 KMP_DEBUG_ASSERT( 2058 this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE || 2059 this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered == 2060 TRUE); 2061 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 2062 __kmp_task_team_setup(this_thr, team); 2063 2064 #if USE_ITT_BUILD 2065 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2066 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2067 #endif /* USE_ITT_BUILD */ 2068 } 2069 } 2070 } 2071 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", 2072 gtid, __kmp_team_from_gtid(gtid)->t.t_id, 2073 __kmp_tid_from_gtid(gtid), status)); 2074 2075 #if OMPT_SUPPORT 2076 if (ompt_enabled.enabled) { 2077 #if OMPT_OPTIONAL 2078 if (ompt_enabled.ompt_callback_sync_region_wait) { 2079 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2080 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, 2081 return_address); 2082 } 2083 if (ompt_enabled.ompt_callback_sync_region) { 2084 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2085 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, 2086 return_address); 2087 } 2088 #endif 2089 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 2090 } 2091 #endif 2092 2093 if (cancellable) 2094 return (int)cancelled; 2095 return status; 2096 } 2097 2098 // Returns 0 if primary thread, 1 if worker thread. 
2099 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, 2100 size_t reduce_size, void *reduce_data, 2101 void (*reduce)(void *, void *)) { 2102 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data, 2103 reduce); 2104 } 2105 2106 #if defined(KMP_GOMP_COMPAT) 2107 // Returns 1 if cancelled, 0 otherwise 2108 int __kmp_barrier_gomp_cancel(int gtid) { 2109 if (__kmp_omp_cancellation) { 2110 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 2111 0, NULL, NULL); 2112 if (cancelled) { 2113 int tid = __kmp_tid_from_gtid(gtid); 2114 kmp_info_t *this_thr = __kmp_threads[gtid]; 2115 if (KMP_MASTER_TID(tid)) { 2116 // Primary thread does not need to revert anything 2117 } else { 2118 // Workers need to revert their private b_arrived flag 2119 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= 2120 KMP_BARRIER_STATE_BUMP; 2121 } 2122 } 2123 return cancelled; 2124 } 2125 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2126 return FALSE; 2127 } 2128 #endif 2129 2130 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { 2131 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); 2132 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 2133 KMP_DEBUG_ASSERT(bt < bs_last_barrier); 2134 int tid = __kmp_tid_from_gtid(gtid); 2135 kmp_info_t *this_thr = __kmp_threads[gtid]; 2136 kmp_team_t *team = this_thr->th.th_team; 2137 2138 if (!team->t.t_serialized) { 2139 if (KMP_MASTER_GTID(gtid)) { 2140 switch (__kmp_barrier_release_pattern[bt]) { 2141 case bp_dist_bar: { 2142 __kmp_dist_barrier_release(bt, this_thr, gtid, tid, 2143 FALSE USE_ITT_BUILD_ARG(NULL)); 2144 break; 2145 } 2146 case bp_hyper_bar: { 2147 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2148 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 2149 FALSE USE_ITT_BUILD_ARG(NULL)); 2150 break; 2151 } 2152 case bp_hierarchical_bar: { 2153 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, 2154 FALSE USE_ITT_BUILD_ARG(NULL)); 2155 break; 2156 } 2157 case bp_tree_bar: { 2158 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2159 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 2160 FALSE USE_ITT_BUILD_ARG(NULL)); 2161 break; 2162 } 2163 default: { 2164 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 2165 FALSE USE_ITT_BUILD_ARG(NULL)); 2166 } 2167 } 2168 if (__kmp_tasking_mode != tskm_immediate_exec) { 2169 __kmp_task_team_sync(this_thr, team); 2170 } // if 2171 } 2172 } 2173 } 2174 2175 void __kmp_join_barrier(int gtid) { 2176 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); 2177 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 2178 2179 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 2180 2181 kmp_info_t *this_thr = __kmp_threads[gtid]; 2182 kmp_team_t *team; 2183 int tid; 2184 #ifdef KMP_DEBUG 2185 int team_id; 2186 #endif /* KMP_DEBUG */ 2187 #if USE_ITT_BUILD 2188 void *itt_sync_obj = NULL; 2189 #if USE_ITT_NOTIFY 2190 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need 2191 // Get object created at fork_barrier 2192 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2193 #endif 2194 #endif /* USE_ITT_BUILD */ 2195 #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG) 2196 int nproc = this_thr->th.th_team_nproc; 2197 #endif 2198 KMP_MB(); 2199 2200 // Get current info 2201 team = this_thr->th.th_team; 2202 KMP_DEBUG_ASSERT(nproc == team->t.t_nproc); 2203 tid = __kmp_tid_from_gtid(gtid); 2204 #ifdef KMP_DEBUG 2205 team_id = team->t.t_id; 2206 kmp_info_t *master_thread = 
this_thr->th.th_team_master; 2207 if (master_thread != team->t.t_threads[0]) { 2208 __kmp_print_structure(); 2209 } 2210 #endif /* KMP_DEBUG */ 2211 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); 2212 KMP_MB(); 2213 2214 // Verify state 2215 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); 2216 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); 2217 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); 2218 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", 2219 gtid, team_id, tid)); 2220 2221 #if OMPT_SUPPORT 2222 if (ompt_enabled.enabled) { 2223 #if OMPT_OPTIONAL 2224 ompt_data_t *my_task_data; 2225 ompt_data_t *my_parallel_data; 2226 void *codeptr = NULL; 2227 int ds_tid = this_thr->th.th_info.ds.ds_tid; 2228 if (KMP_MASTER_TID(ds_tid) && 2229 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 2230 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 2231 codeptr = team->t.ompt_team_info.master_return_address; 2232 my_task_data = OMPT_CUR_TASK_DATA(this_thr); 2233 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 2234 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; 2235 ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel; 2236 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) { 2237 sync_kind = ompt_sync_region_barrier_teams; 2238 ompt_state = ompt_state_wait_barrier_teams; 2239 } 2240 if (ompt_enabled.ompt_callback_sync_region) { 2241 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2242 sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr); 2243 } 2244 if (ompt_enabled.ompt_callback_sync_region_wait) { 2245 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2246 sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr); 2247 } 2248 if (!KMP_MASTER_TID(ds_tid)) 2249 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr); 2250 #endif 2251 this_thr->th.ompt_thread_info.state = ompt_state; 2252 } 2253 #endif 2254 2255 if (__kmp_tasking_mode == tskm_extra_barrier) { 2256 __kmp_tasking_barrier(team, this_thr, gtid); 2257 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", 2258 gtid, team_id, tid)); 2259 } 2260 #ifdef KMP_DEBUG 2261 if (__kmp_tasking_mode != tskm_immediate_exec) { 2262 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = " 2263 "%p, th_task_team = %p\n", 2264 __kmp_gtid_from_thread(this_thr), team_id, 2265 team->t.t_task_team[this_thr->th.th_task_state], 2266 this_thr->th.th_task_team)); 2267 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr); 2268 } 2269 #endif /* KMP_DEBUG */ 2270 2271 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 2272 access it when the team struct is not guaranteed to exist. Doing these 2273 loads causes a cache miss slows down EPCC parallel by 2x. As a workaround, 2274 we do not perform the copy if blocktime=infinite, since the values are not 2275 used by __kmp_wait_template() in that case. 
*/ 2276 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 2277 #if KMP_USE_MONITOR 2278 this_thr->th.th_team_bt_intervals = 2279 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 2280 this_thr->th.th_team_bt_set = 2281 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 2282 #else 2283 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 2284 #endif 2285 } 2286 2287 #if USE_ITT_BUILD 2288 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2289 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 2290 #endif /* USE_ITT_BUILD */ 2291 2292 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { 2293 case bp_dist_bar: { 2294 __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2295 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2296 break; 2297 } 2298 case bp_hyper_bar: { 2299 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 2300 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2301 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2302 break; 2303 } 2304 case bp_hierarchical_bar: { 2305 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2306 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2307 break; 2308 } 2309 case bp_tree_bar: { 2310 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 2311 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2312 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2313 break; 2314 } 2315 default: { 2316 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2317 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2318 } 2319 } 2320 2321 /* From this point on, the team data structure may be deallocated at any time 2322 by the primary thread - it is unsafe to reference it in any of the worker 2323 threads. Any per-team data items that need to be referenced before the 2324 end of the barrier should be moved to the kmp_task_team_t structs. */ 2325 if (KMP_MASTER_TID(tid)) { 2326 if (__kmp_tasking_mode != tskm_immediate_exec) { 2327 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 2328 } 2329 if (__kmp_display_affinity) { 2330 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0); 2331 } 2332 #if KMP_STATS_ENABLED 2333 // Have primary thread flag the workers to indicate they are now waiting for 2334 // next parallel region, Also wake them up so they switch their timers to 2335 // idle. 
2336 for (int i = 0; i < team->t.t_nproc; ++i) { 2337 kmp_info_t *team_thread = team->t.t_threads[i]; 2338 if (team_thread == this_thr) 2339 continue; 2340 team_thread->th.th_stats->setIdleFlag(); 2341 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && 2342 team_thread->th.th_sleep_loc != NULL) 2343 __kmp_null_resume_wrapper(team_thread); 2344 } 2345 #endif 2346 #if USE_ITT_BUILD 2347 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2348 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 2349 #endif /* USE_ITT_BUILD */ 2350 2351 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2352 // Join barrier - report frame end 2353 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2354 __kmp_forkjoin_frames_mode && 2355 (this_thr->th.th_teams_microtask == NULL || // either not in teams 2356 this_thr->th.th_teams_size.nteams == 1) && // or inside single team 2357 team->t.t_active_level == 1) { 2358 kmp_uint64 cur_time = __itt_get_timestamp(); 2359 ident_t *loc = team->t.t_ident; 2360 kmp_info_t **other_threads = team->t.t_threads; 2361 switch (__kmp_forkjoin_frames_mode) { 2362 case 1: 2363 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 2364 loc, nproc); 2365 break; 2366 case 2: 2367 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, 2368 loc, nproc); 2369 break; 2370 case 3: 2371 if (__itt_metadata_add_ptr) { 2372 // Initialize with primary thread's wait time 2373 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; 2374 // Set arrive time to zero to be able to check it in 2375 // __kmp_invoke_task(); the same is done inside the loop below 2376 this_thr->th.th_bar_arrive_time = 0; 2377 for (int i = 1; i < nproc; ++i) { 2378 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 2379 other_threads[i]->th.th_bar_arrive_time = 0; 2380 } 2381 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 2382 cur_time, delta, 0); 2383 } 2384 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 2385 loc, nproc); 2386 this_thr->th.th_frame_time = cur_time; 2387 break; 2388 } 2389 } 2390 #endif /* USE_ITT_BUILD */ 2391 } 2392 #if USE_ITT_BUILD 2393 else { 2394 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2395 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 2396 } 2397 #endif /* USE_ITT_BUILD */ 2398 2399 #if KMP_DEBUG 2400 if (KMP_MASTER_TID(tid)) { 2401 KA_TRACE( 2402 15, 2403 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", 2404 gtid, team_id, tid, nproc)); 2405 } 2406 #endif /* KMP_DEBUG */ 2407 2408 // TODO now, mark worker threads as done so they may be disbanded 2409 KMP_MB(); // Flush all pending memory write invalidates. 2410 KA_TRACE(10, 2411 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); 2412 2413 } 2414 2415 // TODO release worker threads' fork barriers as we are ready instead of all at 2416 // once 2417 void __kmp_fork_barrier(int gtid, int tid) { 2418 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); 2419 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 2420 kmp_info_t *this_thr = __kmp_threads[gtid]; 2421 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; 2422 #if USE_ITT_BUILD 2423 void *itt_sync_obj = NULL; 2424 #endif /* USE_ITT_BUILD */ 2425 #ifdef KMP_DEBUG 2426 if (team) 2427 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, 2428 (team != NULL) ? 
team->t.t_id : -1, tid)); 2429 #endif 2430 // th_team pointer only valid for primary thread here 2431 if (KMP_MASTER_TID(tid)) { 2432 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2433 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2434 // Create itt barrier object 2435 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); 2436 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing 2437 } 2438 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2439 2440 #ifdef KMP_DEBUG 2441 KMP_DEBUG_ASSERT(team); 2442 kmp_info_t **other_threads = team->t.t_threads; 2443 int i; 2444 2445 // Verify state 2446 KMP_MB(); 2447 2448 for (i = 1; i < team->t.t_nproc; ++i) { 2449 KA_TRACE(500, 2450 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " 2451 "== %u.\n", 2452 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, 2453 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, 2454 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); 2455 KMP_DEBUG_ASSERT( 2456 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) & 2457 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE); 2458 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); 2459 } 2460 #endif 2461 2462 if (__kmp_tasking_mode != tskm_immediate_exec) 2463 __kmp_task_team_setup(this_thr, team); 2464 2465 /* The primary thread may have changed its blocktime between join barrier 2466 and fork barrier. Copy the blocktime info to the thread, where 2467 __kmp_wait_template() can access it when the team struct is not 2468 guaranteed to exist. */ 2469 // See note about the corresponding code in __kmp_join_barrier() being 2470 // performance-critical 2471 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 2472 #if KMP_USE_MONITOR 2473 this_thr->th.th_team_bt_intervals = 2474 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 2475 this_thr->th.th_team_bt_set = 2476 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 2477 #else 2478 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 2479 #endif 2480 } 2481 } // primary thread 2482 2483 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { 2484 case bp_dist_bar: { 2485 __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2486 TRUE USE_ITT_BUILD_ARG(NULL)); 2487 break; 2488 } 2489 case bp_hyper_bar: { 2490 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 2491 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2492 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2493 break; 2494 } 2495 case bp_hierarchical_bar: { 2496 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2497 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2498 break; 2499 } 2500 case bp_tree_bar: { 2501 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 2502 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2503 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2504 break; 2505 } 2506 default: { 2507 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2508 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2509 } 2510 } 2511 2512 #if OMPT_SUPPORT 2513 ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state; 2514 if (ompt_enabled.enabled && 2515 (ompt_state == ompt_state_wait_barrier_teams || 2516 ompt_state == ompt_state_wait_barrier_implicit_parallel)) { 2517 int ds_tid = this_thr->th.th_info.ds.ds_tid; 2518 ompt_data_t *task_data = (team) 2519 ? 
OMPT_CUR_TASK_DATA(this_thr) 2520 : &(this_thr->th.ompt_thread_info.task_data); 2521 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 2522 #if OMPT_OPTIONAL 2523 void *codeptr = NULL; 2524 if (KMP_MASTER_TID(ds_tid) && 2525 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 2526 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 2527 codeptr = team ? team->t.ompt_team_info.master_return_address : NULL; 2528 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; 2529 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) 2530 sync_kind = ompt_sync_region_barrier_teams; 2531 if (ompt_enabled.ompt_callback_sync_region_wait) { 2532 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2533 sync_kind, ompt_scope_end, NULL, task_data, codeptr); 2534 } 2535 if (ompt_enabled.ompt_callback_sync_region) { 2536 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2537 sync_kind, ompt_scope_end, NULL, task_data, codeptr); 2538 } 2539 #endif 2540 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 2541 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2542 ompt_scope_end, NULL, task_data, 0, ds_tid, 2543 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 2544 } 2545 } 2546 #endif 2547 2548 // Early exit for reaping threads releasing forkjoin barrier 2549 if (TCR_4(__kmp_global.g.g_done)) { 2550 this_thr->th.th_task_team = NULL; 2551 2552 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2553 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2554 if (!KMP_MASTER_TID(tid)) { 2555 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2556 if (itt_sync_obj) 2557 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2558 } 2559 } 2560 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2561 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); 2562 return; 2563 } 2564 2565 /* We can now assume that a valid team structure has been allocated by the 2566 primary thread and propagated to all worker threads. The current thread, 2567 however, may not be part of the team, so we can't blindly assume that the 2568 team pointer is non-null. */ 2569 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); 2570 KMP_DEBUG_ASSERT(team != NULL); 2571 tid = __kmp_tid_from_gtid(gtid); 2572 2573 #if KMP_BARRIER_ICV_PULL 2574 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in 2575 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's 2576 implicit task has this data before this function is called. We cannot 2577 modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's 2578 thread struct, because it is not always the case that the threads arrays 2579 have been allocated when __kmp_fork_call() is executed. */ 2580 { 2581 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 2582 if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs 2583 // Copy the initial ICVs from the primary thread's thread struct to the 2584 // implicit task for this tid. 
2585 KA_TRACE(10, 2586 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); 2587 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, 2588 tid, FALSE); 2589 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 2590 &team->t.t_threads[0] 2591 ->th.th_bar[bs_forkjoin_barrier] 2592 .bb.th_fixed_icvs); 2593 } 2594 } 2595 #endif // KMP_BARRIER_ICV_PULL 2596 2597 if (__kmp_tasking_mode != tskm_immediate_exec) { 2598 __kmp_task_team_sync(this_thr, team); 2599 } 2600 2601 #if KMP_AFFINITY_SUPPORTED 2602 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 2603 if (proc_bind == proc_bind_intel) { 2604 // Call dynamic affinity settings 2605 if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) { 2606 __kmp_balanced_affinity(this_thr, team->t.t_nproc); 2607 } 2608 } else if (proc_bind != proc_bind_false) { 2609 if (this_thr->th.th_new_place == this_thr->th.th_current_place) { 2610 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", 2611 __kmp_gtid_from_thread(this_thr), 2612 this_thr->th.th_current_place)); 2613 } else { 2614 __kmp_affinity_bind_place(gtid); 2615 } 2616 } 2617 #endif // KMP_AFFINITY_SUPPORTED 2618 // Perform the display affinity functionality 2619 if (__kmp_display_affinity) { 2620 if (team->t.t_display_affinity 2621 #if KMP_AFFINITY_SUPPORTED 2622 || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) 2623 #endif 2624 ) { 2625 // NULL means use the affinity-format-var ICV 2626 __kmp_aux_display_affinity(gtid, NULL); 2627 this_thr->th.th_prev_num_threads = team->t.t_nproc; 2628 this_thr->th.th_prev_level = team->t.t_level; 2629 } 2630 } 2631 if (!KMP_MASTER_TID(tid)) 2632 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator); 2633 2634 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2635 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2636 if (!KMP_MASTER_TID(tid)) { 2637 // Get correct barrier object 2638 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2639 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired 2640 } // (prepare called inside barrier_release) 2641 } 2642 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2643 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, 2644 team->t.t_id, tid)); 2645 } 2646 2647 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 2648 kmp_internal_control_t *new_icvs, ident_t *loc) { 2649 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); 2650 2651 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); 2652 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 2653 2654 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in 2655 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's 2656 implicit task has this data before this function is called. */ 2657 #if KMP_BARRIER_ICV_PULL 2658 /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which 2659 remains untouched), where all of the worker threads can access them and 2660 make their own copies after the barrier. */ 2661 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be 2662 // allocated at this point 2663 copy_icvs( 2664 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, 2665 new_icvs); 2666 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0, 2667 team->t.t_threads[0], team)); 2668 #elif KMP_BARRIER_ICV_PUSH 2669 // The ICVs will be propagated in the fork barrier, so nothing needs to be 2670 // done here. 
2671 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, 2672 team->t.t_threads[0], team)); 2673 #else 2674 // Copy the ICVs to each of the non-primary threads. This takes O(nthreads) 2675 // time. 2676 ngo_load(new_icvs); 2677 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be 2678 // allocated at this point 2679 for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread 2680 // TODO: GEH - pass in better source location info since usually NULL here 2681 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 2682 f, team->t.t_threads[f], team)); 2683 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); 2684 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); 2685 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 2686 f, team->t.t_threads[f], team)); 2687 } 2688 ngo_sync(); 2689 #endif // KMP_BARRIER_ICV_PULL 2690 } 2691