/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to master thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the master thread
       at any time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
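    // For illustration only (the numbers are hypothetical, not values from
    // kmp.h): with nproc == 4 and team_bar->b_arrived == 0, new_state is
    // 0 + KMP_BARRIER_STATE_BUMP, and the loop below spin-waits in turn for
    // other_threads[1..3]->th.th_bar[bt].bb.b_arrived to reach that value,
    // folding in each worker's reduce_data once it has arrived whenever a
    // reduce callback was supplied.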
91 for (i = 1; i < nproc; ++i) { 92 #if KMP_CACHE_MANAGE 93 // Prefetch next thread's arrived count 94 if (i + 1 < nproc) 95 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived); 96 #endif /* KMP_CACHE_MANAGE */ 97 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " 98 "arrived(%p) == %llu\n", 99 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), 100 team->t.t_id, i, 101 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); 102 103 // Wait for worker thread to arrive 104 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, 105 new_state); 106 if (cancellable) { 107 bool cancelled = flag.wait_cancellable_nosleep( 108 this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 109 if (cancelled) 110 return true; 111 } else { 112 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 113 } 114 ANNOTATE_BARRIER_END(other_threads[i]); 115 #if USE_ITT_BUILD && USE_ITT_NOTIFY 116 // Barrier imbalance - write min of the thread time and the other thread 117 // time to the thread. 118 if (__kmp_forkjoin_frames_mode == 2) { 119 this_thr->th.th_bar_min_time = KMP_MIN( 120 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time); 121 } 122 #endif 123 if (reduce) { 124 KA_TRACE(100, 125 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", 126 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), 127 team->t.t_id, i)); 128 ANNOTATE_REDUCE_AFTER(reduce); 129 OMPT_REDUCTION_DECL(this_thr, gtid); 130 OMPT_REDUCTION_BEGIN; 131 (*reduce)(this_thr->th.th_local.reduce_data, 132 other_threads[i]->th.th_local.reduce_data); 133 OMPT_REDUCTION_END; 134 ANNOTATE_REDUCE_BEFORE(reduce); 135 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 136 } 137 } 138 // Don't have to worry about sleep bit here or atomic since team setting 139 team_bar->b_arrived = new_state; 140 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d " 141 "arrived(%p) = %llu\n", 142 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, 143 new_state)); 144 } 145 KA_TRACE( 146 20, 147 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 148 gtid, team->t.t_id, tid, bt)); 149 return false; 150 } 151 152 template <bool cancellable = false> 153 static bool __kmp_linear_barrier_release_template( 154 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 155 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 156 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release); 157 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 158 kmp_team_t *team; 159 160 if (KMP_MASTER_TID(tid)) { 161 unsigned int i; 162 kmp_uint32 nproc = this_thr->th.th_team_nproc; 163 kmp_info_t **other_threads; 164 165 team = __kmp_threads[gtid]->th.th_team; 166 KMP_DEBUG_ASSERT(team != NULL); 167 other_threads = team->t.t_threads; 168 169 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for " 170 "barrier type %d\n", 171 gtid, team->t.t_id, tid, bt)); 172 173 if (nproc > 1) { 174 #if KMP_BARRIER_ICV_PUSH 175 { 176 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 177 if (propagate_icvs) { 178 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); 179 for (i = 1; i < nproc; ++i) { 180 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], 181 team, i, FALSE); 182 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, 183 &team->t.t_implicit_task_taskdata[0].td_icvs); 184 } 185 ngo_sync(); 186 } 187 } 188 #endif // KMP_BARRIER_ICV_PUSH 189 190 // Now, release all of the worker threads 191 for (i = 1; i < nproc; ++i) { 192 
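        // Each iteration below advances worker i's b_go by
        // KMP_BARRIER_STATE_BUMP (the two values printed by the trace that
        // follows); kmp_flag_64::release() performs that update and also
        // wakes the worker if it suspended on the flag under a finite
        // blocktime.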
#if KMP_CACHE_MANAGE 193 // Prefetch next thread's go flag 194 if (i + 1 < nproc) 195 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go); 196 #endif /* KMP_CACHE_MANAGE */ 197 KA_TRACE( 198 20, 199 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " 200 "go(%p): %u => %u\n", 201 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid, 202 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go, 203 other_threads[i]->th.th_bar[bt].bb.b_go, 204 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); 205 ANNOTATE_BARRIER_BEGIN(other_threads[i]); 206 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, 207 other_threads[i]); 208 flag.release(); 209 } 210 } 211 } else { // Wait for the MASTER thread to release us 212 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", 213 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 214 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 215 if (cancellable) { 216 bool cancelled = flag.wait_cancellable_nosleep( 217 this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 218 if (cancelled) { 219 return true; 220 } 221 } else { 222 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 223 } 224 ANNOTATE_BARRIER_END(this_thr); 225 #if USE_ITT_BUILD && USE_ITT_NOTIFY 226 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 227 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is 228 // disabled) 229 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 230 // Cancel wait on previous parallel region... 231 __kmp_itt_task_starting(itt_sync_obj); 232 233 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 234 return false; 235 236 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 237 if (itt_sync_obj != NULL) 238 // Call prepare as early as possible for "new" barrier 239 __kmp_itt_task_finished(itt_sync_obj); 240 } else 241 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 242 // Early exit for reaping threads releasing forkjoin barrier 243 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 244 return false; 245 // The worker thread may now assume that the team is valid. 246 #ifdef KMP_DEBUG 247 tid = __kmp_tid_from_gtid(gtid); 248 team = __kmp_threads[gtid]->th.th_team; 249 #endif 250 KMP_DEBUG_ASSERT(team != NULL); 251 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 252 KA_TRACE(20, 253 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 254 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 255 KMP_MB(); // Flush all pending memory write invalidates. 
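    // At this point the worker has consumed its wakeup and reset b_go to
    // KMP_INIT_BARRIER_STATE so the flag can be reused by the next barrier;
    // the full fence above is intended to order that reset and the preceding
    // handshake before any later reads of the (now valid) team state.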
256 } 257 KA_TRACE( 258 20, 259 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 260 gtid, team->t.t_id, tid, bt)); 261 return false; 262 } 263 264 static void __kmp_linear_barrier_gather( 265 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 266 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 267 __kmp_linear_barrier_gather_template<false>( 268 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 269 } 270 271 static bool __kmp_linear_barrier_gather_cancellable( 272 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 273 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 274 return __kmp_linear_barrier_gather_template<true>( 275 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 276 } 277 278 static void __kmp_linear_barrier_release( 279 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 280 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 281 __kmp_linear_barrier_release_template<false>( 282 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); 283 } 284 285 static bool __kmp_linear_barrier_release_cancellable( 286 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 287 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 288 return __kmp_linear_barrier_release_template<true>( 289 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); 290 } 291 292 // Tree barrier 293 static void 294 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, 295 int tid, void (*reduce)(void *, void *) 296 USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 297 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather); 298 kmp_team_t *team = this_thr->th.th_team; 299 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 300 kmp_info_t **other_threads = team->t.t_threads; 301 kmp_uint32 nproc = this_thr->th.th_team_nproc; 302 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; 303 kmp_uint32 branch_factor = 1 << branch_bits; 304 kmp_uint32 child; 305 kmp_uint32 child_tid; 306 kmp_uint64 new_state; 307 308 KA_TRACE( 309 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 310 gtid, team->t.t_id, tid, bt)); 311 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 312 313 #if USE_ITT_BUILD && USE_ITT_NOTIFY 314 // Barrier imbalance - save arrive time to the thread 315 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 316 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 317 __itt_get_timestamp(); 318 } 319 #endif 320 // Perform tree gather to wait until all threads have arrived; reduce any 321 // required data as we go 322 child_tid = (tid << branch_bits) + 1; 323 if (child_tid < nproc) { 324 // Parent threads wait for all their children to arrive 325 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 326 child = 1; 327 do { 328 kmp_info_t *child_thr = other_threads[child_tid]; 329 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 330 #if KMP_CACHE_MANAGE 331 // Prefetch next thread's arrived count 332 if (child + 1 <= branch_factor && child_tid + 1 < nproc) 333 KMP_CACHE_PREFETCH( 334 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived); 335 #endif /* KMP_CACHE_MANAGE */ 336 KA_TRACE(20, 337 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 338 "arrived(%p) == %llu\n", 339 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 340 team->t.t_id, child_tid, 
&child_bar->b_arrived, new_state)); 341 // Wait for child to arrive 342 kmp_flag_64 flag(&child_bar->b_arrived, new_state); 343 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 344 ANNOTATE_BARRIER_END(child_thr); 345 #if USE_ITT_BUILD && USE_ITT_NOTIFY 346 // Barrier imbalance - write min of the thread time and a child time to 347 // the thread. 348 if (__kmp_forkjoin_frames_mode == 2) { 349 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, 350 child_thr->th.th_bar_min_time); 351 } 352 #endif 353 if (reduce) { 354 KA_TRACE(100, 355 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", 356 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 357 team->t.t_id, child_tid)); 358 ANNOTATE_REDUCE_AFTER(reduce); 359 OMPT_REDUCTION_DECL(this_thr, gtid); 360 OMPT_REDUCTION_BEGIN; 361 (*reduce)(this_thr->th.th_local.reduce_data, 362 child_thr->th.th_local.reduce_data); 363 OMPT_REDUCTION_END; 364 ANNOTATE_REDUCE_BEFORE(reduce); 365 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 366 } 367 child++; 368 child_tid++; 369 } while (child <= branch_factor && child_tid < nproc); 370 } 371 372 if (!KMP_MASTER_TID(tid)) { // Worker threads 373 kmp_int32 parent_tid = (tid - 1) >> branch_bits; 374 375 KA_TRACE(20, 376 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 377 "arrived(%p): %llu => %llu\n", 378 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), 379 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, 380 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 381 382 // Mark arrival to parent thread 383 /* After performing this write, a worker thread may not assume that the team 384 is valid any more - it could be deallocated by the master thread at any 385 time. */ 386 ANNOTATE_BARRIER_BEGIN(this_thr); 387 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); 388 flag.release(); 389 } else { 390 // Need to update the team arrived pointer if we are the master thread 391 if (nproc > 1) // New value was already computed above 392 team->t.t_bar[bt].b_arrived = new_state; 393 else 394 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; 395 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d " 396 "arrived(%p) = %llu\n", 397 gtid, team->t.t_id, tid, team->t.t_id, 398 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 399 } 400 KA_TRACE(20, 401 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 402 gtid, team->t.t_id, tid, bt)); 403 } 404 405 static void __kmp_tree_barrier_release( 406 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 407 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 408 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release); 409 kmp_team_t *team; 410 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 411 kmp_uint32 nproc; 412 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; 413 kmp_uint32 branch_factor = 1 << branch_bits; 414 kmp_uint32 child; 415 kmp_uint32 child_tid; 416 417 // Perform a tree release for all of the threads that have been gathered 418 if (!KMP_MASTER_TID( 419 tid)) { // Handle fork barrier workers who aren't part of a team yet 420 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid, 421 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 422 // Wait for parent thread to release us 423 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 424 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 425 ANNOTATE_BARRIER_END(this_thr); 426 #if USE_ITT_BUILD && USE_ITT_NOTIFY 427 if 
((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 428 // In fork barrier where we could not get the object reliably (or 429 // ITTNOTIFY is disabled) 430 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 431 // Cancel wait on previous parallel region... 432 __kmp_itt_task_starting(itt_sync_obj); 433 434 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 435 return; 436 437 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 438 if (itt_sync_obj != NULL) 439 // Call prepare as early as possible for "new" barrier 440 __kmp_itt_task_finished(itt_sync_obj); 441 } else 442 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 443 // Early exit for reaping threads releasing forkjoin barrier 444 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 445 return; 446 447 // The worker thread may now assume that the team is valid. 448 team = __kmp_threads[gtid]->th.th_team; 449 KMP_DEBUG_ASSERT(team != NULL); 450 tid = __kmp_tid_from_gtid(gtid); 451 452 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 453 KA_TRACE(20, 454 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid, 455 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 456 KMP_MB(); // Flush all pending memory write invalidates. 457 } else { 458 team = __kmp_threads[gtid]->th.th_team; 459 KMP_DEBUG_ASSERT(team != NULL); 460 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for " 461 "barrier type %d\n", 462 gtid, team->t.t_id, tid, bt)); 463 } 464 nproc = this_thr->th.th_team_nproc; 465 child_tid = (tid << branch_bits) + 1; 466 467 if (child_tid < nproc) { 468 kmp_info_t **other_threads = team->t.t_threads; 469 child = 1; 470 // Parent threads release all their children 471 do { 472 kmp_info_t *child_thr = other_threads[child_tid]; 473 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 474 #if KMP_CACHE_MANAGE 475 // Prefetch next thread's go count 476 if (child + 1 <= branch_factor && child_tid + 1 < nproc) 477 KMP_CACHE_PREFETCH( 478 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go); 479 #endif /* KMP_CACHE_MANAGE */ 480 481 #if KMP_BARRIER_ICV_PUSH 482 { 483 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 484 if (propagate_icvs) { 485 __kmp_init_implicit_task(team->t.t_ident, 486 team->t.t_threads[child_tid], team, 487 child_tid, FALSE); 488 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, 489 &team->t.t_implicit_task_taskdata[0].td_icvs); 490 } 491 } 492 #endif // KMP_BARRIER_ICV_PUSH 493 KA_TRACE(20, 494 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" 495 "go(%p): %u => %u\n", 496 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 497 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 498 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 499 // Release child from barrier 500 ANNOTATE_BARRIER_BEGIN(child_thr); 501 kmp_flag_64 flag(&child_bar->b_go, child_thr); 502 flag.release(); 503 child++; 504 child_tid++; 505 } while (child <= branch_factor && child_tid < nproc); 506 } 507 KA_TRACE( 508 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 509 gtid, team->t.t_id, tid, bt)); 510 } 511 512 // Hyper Barrier 513 static void 514 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, 515 int tid, void (*reduce)(void *, void *) 516 USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 517 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather); 518 kmp_team_t *team = this_thr->th.th_team; 519 kmp_bstate_t *thr_bar = 
&this_thr->th.th_bar[bt].bb; 520 kmp_info_t **other_threads = team->t.t_threads; 521 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; 522 kmp_uint32 num_threads = this_thr->th.th_team_nproc; 523 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; 524 kmp_uint32 branch_factor = 1 << branch_bits; 525 kmp_uint32 offset; 526 kmp_uint32 level; 527 528 KA_TRACE( 529 20, 530 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 531 gtid, team->t.t_id, tid, bt)); 532 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 533 534 #if USE_ITT_BUILD && USE_ITT_NOTIFY 535 // Barrier imbalance - save arrive time to the thread 536 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 537 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 538 __itt_get_timestamp(); 539 } 540 #endif 541 /* Perform a hypercube-embedded tree gather to wait until all of the threads 542 have arrived, and reduce any required data as we go. */ 543 kmp_flag_64 p_flag(&thr_bar->b_arrived); 544 for (level = 0, offset = 1; offset < num_threads; 545 level += branch_bits, offset <<= branch_bits) { 546 kmp_uint32 child; 547 kmp_uint32 child_tid; 548 549 if (((tid >> level) & (branch_factor - 1)) != 0) { 550 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); 551 552 KA_TRACE(20, 553 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 554 "arrived(%p): %llu => %llu\n", 555 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), 556 team->t.t_id, parent_tid, &thr_bar->b_arrived, 557 thr_bar->b_arrived, 558 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 559 // Mark arrival to parent thread 560 /* After performing this write (in the last iteration of the enclosing for 561 loop), a worker thread may not assume that the team is valid any more 562 - it could be deallocated by the master thread at any time. */ 563 ANNOTATE_BARRIER_BEGIN(this_thr); 564 p_flag.set_waiter(other_threads[parent_tid]); 565 p_flag.release(); 566 break; 567 } 568 569 // Parent threads wait for children to arrive 570 if (new_state == KMP_BARRIER_UNUSED_STATE) 571 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 572 for (child = 1, child_tid = tid + (1 << level); 573 child < branch_factor && child_tid < num_threads; 574 child++, child_tid += (1 << level)) { 575 kmp_info_t *child_thr = other_threads[child_tid]; 576 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 577 #if KMP_CACHE_MANAGE 578 kmp_uint32 next_child_tid = child_tid + (1 << level); 579 // Prefetch next thread's arrived count 580 if (child + 1 < branch_factor && next_child_tid < num_threads) 581 KMP_CACHE_PREFETCH( 582 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); 583 #endif /* KMP_CACHE_MANAGE */ 584 KA_TRACE(20, 585 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 586 "arrived(%p) == %llu\n", 587 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 588 team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); 589 // Wait for child to arrive 590 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); 591 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 592 ANNOTATE_BARRIER_END(child_thr); 593 #if USE_ITT_BUILD && USE_ITT_NOTIFY 594 // Barrier imbalance - write min of the thread time and a child time to 595 // the thread. 
596 if (__kmp_forkjoin_frames_mode == 2) { 597 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, 598 child_thr->th.th_bar_min_time); 599 } 600 #endif 601 if (reduce) { 602 KA_TRACE(100, 603 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", 604 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 605 team->t.t_id, child_tid)); 606 ANNOTATE_REDUCE_AFTER(reduce); 607 OMPT_REDUCTION_DECL(this_thr, gtid); 608 OMPT_REDUCTION_BEGIN; 609 (*reduce)(this_thr->th.th_local.reduce_data, 610 child_thr->th.th_local.reduce_data); 611 OMPT_REDUCTION_END; 612 ANNOTATE_REDUCE_BEFORE(reduce); 613 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 614 } 615 } 616 } 617 618 if (KMP_MASTER_TID(tid)) { 619 // Need to update the team arrived pointer if we are the master thread 620 if (new_state == KMP_BARRIER_UNUSED_STATE) 621 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; 622 else 623 team->t.t_bar[bt].b_arrived = new_state; 624 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " 625 "arrived(%p) = %llu\n", 626 gtid, team->t.t_id, tid, team->t.t_id, 627 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 628 } 629 KA_TRACE( 630 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 631 gtid, team->t.t_id, tid, bt)); 632 } 633 634 // The reverse versions seem to beat the forward versions overall 635 #define KMP_REVERSE_HYPER_BAR 636 static void __kmp_hyper_barrier_release( 637 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 638 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 639 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); 640 kmp_team_t *team; 641 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 642 kmp_info_t **other_threads; 643 kmp_uint32 num_threads; 644 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; 645 kmp_uint32 branch_factor = 1 << branch_bits; 646 kmp_uint32 child; 647 kmp_uint32 child_tid; 648 kmp_uint32 offset; 649 kmp_uint32 level; 650 651 /* Perform a hypercube-embedded tree release for all of the threads that have 652 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads 653 are released in the reverse order of the corresponding gather, otherwise 654 threads are released in the same order. */ 655 if (KMP_MASTER_TID(tid)) { // master 656 team = __kmp_threads[gtid]->th.th_team; 657 KMP_DEBUG_ASSERT(team != NULL); 658 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for " 659 "barrier type %d\n", 660 gtid, team->t.t_id, tid, bt)); 661 #if KMP_BARRIER_ICV_PUSH 662 if (propagate_icvs) { // master already has ICVs in final destination; copy 663 copy_icvs(&thr_bar->th_fixed_icvs, 664 &team->t.t_implicit_task_taskdata[tid].td_icvs); 665 } 666 #endif 667 } else { // Handle fork barrier workers who aren't part of a team yet 668 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, 669 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 670 // Wait for parent thread to release us 671 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 672 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 673 ANNOTATE_BARRIER_END(this_thr); 674 #if USE_ITT_BUILD && USE_ITT_NOTIFY 675 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 676 // In fork barrier where we could not get the object reliably 677 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 678 // Cancel wait on previous parallel region... 
679 __kmp_itt_task_starting(itt_sync_obj); 680 681 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 682 return; 683 684 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 685 if (itt_sync_obj != NULL) 686 // Call prepare as early as possible for "new" barrier 687 __kmp_itt_task_finished(itt_sync_obj); 688 } else 689 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 690 // Early exit for reaping threads releasing forkjoin barrier 691 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 692 return; 693 694 // The worker thread may now assume that the team is valid. 695 team = __kmp_threads[gtid]->th.th_team; 696 KMP_DEBUG_ASSERT(team != NULL); 697 tid = __kmp_tid_from_gtid(gtid); 698 699 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 700 KA_TRACE(20, 701 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 702 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 703 KMP_MB(); // Flush all pending memory write invalidates. 704 } 705 num_threads = this_thr->th.th_team_nproc; 706 other_threads = team->t.t_threads; 707 708 #ifdef KMP_REVERSE_HYPER_BAR 709 // Count up to correct level for parent 710 for (level = 0, offset = 1; 711 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0); 712 level += branch_bits, offset <<= branch_bits) 713 ; 714 715 // Now go down from there 716 for (level -= branch_bits, offset >>= branch_bits; offset != 0; 717 level -= branch_bits, offset >>= branch_bits) 718 #else 719 // Go down the tree, level by level 720 for (level = 0, offset = 1; offset < num_threads; 721 level += branch_bits, offset <<= branch_bits) 722 #endif // KMP_REVERSE_HYPER_BAR 723 { 724 #ifdef KMP_REVERSE_HYPER_BAR 725 /* Now go in reverse order through the children, highest to lowest. 726 Initial setting of child is conservative here. */ 727 child = num_threads >> ((level == 0) ? level : level - 1); 728 for (child = (child < branch_factor - 1) ? 
                     child : branch_factor - 1,
         child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        ANNOTATE_BARRIER_BEGIN(child_thr);
        kmp_flag_64 flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                             tid, FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on
   their local b_go.
*/ 803 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, 804 kmp_bstate_t *thr_bar, 805 kmp_uint32 nproc, int gtid, 806 int tid, kmp_team_t *team) { 807 // Checks to determine if (re-)initialization is needed 808 bool uninitialized = thr_bar->team == NULL; 809 bool team_changed = team != thr_bar->team; 810 bool team_sz_changed = nproc != thr_bar->nproc; 811 bool tid_changed = tid != thr_bar->old_tid; 812 bool retval = false; 813 814 if (uninitialized || team_sz_changed) { 815 __kmp_get_hierarchy(nproc, thr_bar); 816 } 817 818 if (uninitialized || team_sz_changed || tid_changed) { 819 thr_bar->my_level = thr_bar->depth - 1; // default for master 820 thr_bar->parent_tid = -1; // default for master 821 if (!KMP_MASTER_TID( 822 tid)) { // if not master, find parent thread in hierarchy 823 kmp_uint32 d = 0; 824 while (d < thr_bar->depth) { // find parent based on level of thread in 825 // hierarchy, and note level 826 kmp_uint32 rem; 827 if (d == thr_bar->depth - 2) { // reached level right below the master 828 thr_bar->parent_tid = 0; 829 thr_bar->my_level = d; 830 break; 831 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 832 0) { // TODO: can we make this op faster? 833 // thread is not a subtree root at next level, so this is max 834 thr_bar->parent_tid = tid - rem; 835 thr_bar->my_level = d; 836 break; 837 } 838 ++d; 839 } 840 } 841 thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1); 842 thr_bar->old_tid = tid; 843 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 844 thr_bar->team = team; 845 thr_bar->parent_bar = 846 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 847 } 848 if (uninitialized || team_changed || tid_changed) { 849 thr_bar->team = team; 850 thr_bar->parent_bar = 851 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 852 retval = true; 853 } 854 if (uninitialized || team_sz_changed || tid_changed) { 855 thr_bar->nproc = nproc; 856 thr_bar->leaf_kids = thr_bar->base_leaf_kids; 857 if (thr_bar->my_level == 0) 858 thr_bar->leaf_kids = 0; 859 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) 860 thr_bar->leaf_kids = nproc - tid - 1; 861 thr_bar->leaf_state = 0; 862 for (int i = 0; i < thr_bar->leaf_kids; ++i) 863 ((char *)&(thr_bar->leaf_state))[7 - i] = 1; 864 } 865 return retval; 866 } 867 868 static void __kmp_hierarchical_barrier_gather( 869 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 870 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 871 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); 872 kmp_team_t *team = this_thr->th.th_team; 873 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 874 kmp_uint32 nproc = this_thr->th.th_team_nproc; 875 kmp_info_t **other_threads = team->t.t_threads; 876 kmp_uint64 new_state; 877 878 int level = team->t.t_level; 879 if (other_threads[0] 880 ->th.th_teams_microtask) // are we inside the teams construct? 
881 if (this_thr->th.th_teams_size.nteams > 1) 882 ++level; // level was not increased in teams construct for team_of_masters 883 if (level == 1) 884 thr_bar->use_oncore_barrier = 1; 885 else 886 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 887 888 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for " 889 "barrier type %d\n", 890 gtid, team->t.t_id, tid, bt)); 891 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 892 893 #if USE_ITT_BUILD && USE_ITT_NOTIFY 894 // Barrier imbalance - save arrive time to the thread 895 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 896 this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); 897 } 898 #endif 899 900 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, 901 team); 902 903 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) 904 kmp_int32 child_tid; 905 new_state = 906 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 907 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 908 thr_bar->use_oncore_barrier) { 909 if (thr_bar->leaf_kids) { 910 // First, wait for leaf children to check-in on my b_arrived flag 911 kmp_uint64 leaf_state = 912 KMP_MASTER_TID(tid) 913 ? thr_bar->b_arrived | thr_bar->leaf_state 914 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; 915 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " 916 "for leaf kids\n", 917 gtid, team->t.t_id, tid)); 918 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state); 919 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 920 if (reduce) { 921 ANNOTATE_REDUCE_AFTER(reduce); 922 OMPT_REDUCTION_DECL(this_thr, gtid); 923 OMPT_REDUCTION_BEGIN; 924 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; 925 ++child_tid) { 926 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 927 "T#%d(%d:%d)\n", 928 gtid, team->t.t_id, tid, 929 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 930 child_tid)); 931 ANNOTATE_BARRIER_END(other_threads[child_tid]); 932 (*reduce)(this_thr->th.th_local.reduce_data, 933 other_threads[child_tid]->th.th_local.reduce_data); 934 } 935 OMPT_REDUCTION_END; 936 ANNOTATE_REDUCE_BEFORE(reduce); 937 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 938 } 939 // clear leaf_state bits 940 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state)); 941 } 942 // Next, wait for higher level children on each child's b_arrived flag 943 for (kmp_uint32 d = 1; d < thr_bar->my_level; 944 ++d) { // gather lowest level threads first, but skip 0 945 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], 946 skip = thr_bar->skip_per_level[d]; 947 if (last > nproc) 948 last = nproc; 949 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 950 kmp_info_t *child_thr = other_threads[child_tid]; 951 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 952 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 953 "T#%d(%d:%d) " 954 "arrived(%p) == %llu\n", 955 gtid, team->t.t_id, tid, 956 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 957 child_tid, &child_bar->b_arrived, new_state)); 958 kmp_flag_64 flag(&child_bar->b_arrived, new_state); 959 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 960 ANNOTATE_BARRIER_END(child_thr); 961 if (reduce) { 962 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 963 "T#%d(%d:%d)\n", 964 gtid, team->t.t_id, tid, 965 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 966 child_tid)); 967 
ANNOTATE_REDUCE_AFTER(reduce); 968 (*reduce)(this_thr->th.th_local.reduce_data, 969 child_thr->th.th_local.reduce_data); 970 ANNOTATE_REDUCE_BEFORE(reduce); 971 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 972 } 973 } 974 } 975 } else { // Blocktime is not infinite 976 for (kmp_uint32 d = 0; d < thr_bar->my_level; 977 ++d) { // Gather lowest level threads first 978 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], 979 skip = thr_bar->skip_per_level[d]; 980 if (last > nproc) 981 last = nproc; 982 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 983 kmp_info_t *child_thr = other_threads[child_tid]; 984 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 985 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 986 "T#%d(%d:%d) " 987 "arrived(%p) == %llu\n", 988 gtid, team->t.t_id, tid, 989 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 990 child_tid, &child_bar->b_arrived, new_state)); 991 kmp_flag_64 flag(&child_bar->b_arrived, new_state); 992 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 993 ANNOTATE_BARRIER_END(child_thr); 994 if (reduce) { 995 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 996 "T#%d(%d:%d)\n", 997 gtid, team->t.t_id, tid, 998 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 999 child_tid)); 1000 ANNOTATE_REDUCE_AFTER(reduce); 1001 (*reduce)(this_thr->th.th_local.reduce_data, 1002 child_thr->th.th_local.reduce_data); 1003 ANNOTATE_REDUCE_BEFORE(reduce); 1004 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); 1005 } 1006 } 1007 } 1008 } 1009 } 1010 // All subordinates are gathered; now release parent if not master thread 1011 1012 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy 1013 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing" 1014 " T#%d(%d:%d) arrived(%p): %llu => %llu\n", 1015 gtid, team->t.t_id, tid, 1016 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, 1017 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, 1018 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 1019 /* Mark arrival to parent: After performing this write, a worker thread may 1020 not assume that the team is valid any more - it could be deallocated by 1021 the master thread at any time. */ 1022 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || 1023 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived 1024 // flag; release it 1025 ANNOTATE_BARRIER_BEGIN(this_thr); 1026 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); 1027 flag.release(); 1028 } else { 1029 // Leaf does special release on "offset" bits of parent's b_arrived flag 1030 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 1031 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset); 1032 flag.set_waiter(other_threads[thr_bar->parent_tid]); 1033 flag.release(); 1034 } 1035 } else { // Master thread needs to update the team's b_arrived value 1036 team->t.t_bar[bt].b_arrived = new_state; 1037 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " 1038 "arrived(%p) = %llu\n", 1039 gtid, team->t.t_id, tid, team->t.t_id, 1040 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 1041 } 1042 // Is the team access below unsafe or just technically invalid? 
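  // A concrete, purely illustrative picture of the on-core hand-off used
  // above: a leaf with parent_tid p and tid p + 1 + k signals through byte
  // offset = 7 - k of the parent's b_arrived word (see
  // __kmp_init_hierarchical_barrier_thread), which is exactly the byte the
  // parent pre-set in its leaf_state mask; a parent with three leaf kids
  // therefore waits for bytes 7, 6 and 5 of its own b_arrived to be filled
  // in, then clears them with KMP_TEST_THEN_AND64 before gathering its
  // non-leaf children.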
1043 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " 1044 "barrier type %d\n", 1045 gtid, team->t.t_id, tid, bt)); 1046 } 1047 1048 static void __kmp_hierarchical_barrier_release( 1049 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 1050 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 1051 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); 1052 kmp_team_t *team; 1053 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 1054 kmp_uint32 nproc; 1055 bool team_change = false; // indicates on-core barrier shouldn't be used 1056 1057 if (KMP_MASTER_TID(tid)) { 1058 team = __kmp_threads[gtid]->th.th_team; 1059 KMP_DEBUG_ASSERT(team != NULL); 1060 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master " 1061 "entered barrier type %d\n", 1062 gtid, team->t.t_id, tid, bt)); 1063 } else { // Worker threads 1064 // Wait for parent thread to release me 1065 if (!thr_bar->use_oncore_barrier || 1066 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || 1067 thr_bar->team == NULL) { 1068 // Use traditional method of waiting on my own b_go flag 1069 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; 1070 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 1071 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1072 ANNOTATE_BARRIER_END(this_thr); 1073 TCW_8(thr_bar->b_go, 1074 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1075 } else { // Thread barrier data is initialized, this is a leaf, blocktime is 1076 // infinite, not nested 1077 // Wait on my "offset" bits on parent's b_go flag 1078 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; 1079 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, 1080 thr_bar->offset, bt, 1081 this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); 1082 flag.wait(this_thr, TRUE); 1083 if (thr_bar->wait_flag == 1084 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go 1085 TCW_8(thr_bar->b_go, 1086 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1087 } else { // Reset my bits on parent's b_go flag 1088 (RCAST(volatile char *, 1089 &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0; 1090 } 1091 } 1092 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 1093 // Early exit for reaping threads releasing forkjoin barrier 1094 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 1095 return; 1096 // The worker thread may now assume that the team is valid. 1097 team = __kmp_threads[gtid]->th.th_team; 1098 KMP_DEBUG_ASSERT(team != NULL); 1099 tid = __kmp_tid_from_gtid(gtid); 1100 1101 KA_TRACE( 1102 20, 1103 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 1104 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 1105 KMP_MB(); // Flush all pending memory write invalidates. 1106 } 1107 1108 nproc = this_thr->th.th_team_nproc; 1109 int level = team->t.t_level; 1110 if (team->t.t_threads[0] 1111 ->th.th_teams_microtask) { // are we inside the teams construct? 
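    // The adjustments below compensate for t_level not counting the enclosing
    // teams construct: one extra level for the team of workers (when this is
    // not the teams master microtask and th_teams_level matches) and one for
    // the team of masters (when nteams > 1). Only a resulting level of 1
    // enables the on-core barrier below.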
1112 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 1113 this_thr->th.th_teams_level == level) 1114 ++level; // level was not increased in teams construct for team_of_workers 1115 if (this_thr->th.th_teams_size.nteams > 1) 1116 ++level; // level was not increased in teams construct for team_of_masters 1117 } 1118 if (level == 1) 1119 thr_bar->use_oncore_barrier = 1; 1120 else 1121 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 1122 1123 // If the team size has increased, we still communicate with old leaves via 1124 // oncore barrier. 1125 unsigned short int old_leaf_kids = thr_bar->leaf_kids; 1126 kmp_uint64 old_leaf_state = thr_bar->leaf_state; 1127 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, 1128 tid, team); 1129 // But if the entire team changes, we won't use oncore barrier at all 1130 if (team_change) 1131 old_leaf_kids = 0; 1132 1133 #if KMP_BARRIER_ICV_PUSH 1134 if (propagate_icvs) { 1135 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, 1136 FALSE); 1137 if (KMP_MASTER_TID( 1138 tid)) { // master already has copy in final destination; copy 1139 copy_icvs(&thr_bar->th_fixed_icvs, 1140 &team->t.t_implicit_task_taskdata[tid].td_icvs); 1141 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1142 thr_bar->use_oncore_barrier) { // optimization for inf blocktime 1143 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) 1144 // leaves (on-core children) pull parent's fixed ICVs directly to local 1145 // ICV store 1146 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1147 &thr_bar->parent_bar->th_fixed_icvs); 1148 // non-leaves will get ICVs piggybacked with b_go via NGO store 1149 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs 1150 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can 1151 // access 1152 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); 1153 else // leaves copy parent's fixed ICVs directly to local ICV store 1154 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1155 &thr_bar->parent_bar->th_fixed_icvs); 1156 } 1157 } 1158 #endif // KMP_BARRIER_ICV_PUSH 1159 1160 // Now, release my children 1161 if (thr_bar->my_level) { // not a leaf 1162 kmp_int32 child_tid; 1163 kmp_uint32 last; 1164 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1165 thr_bar->use_oncore_barrier) { 1166 if (KMP_MASTER_TID(tid)) { // do a flat release 1167 // Set local b_go to bump children via NGO store of the cache line 1168 // containing IVCs and b_go. 
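        // Per the cache-line layout described by the surrounding comments,
        // th_fixed_icvs and b_go share one line, so a single ngo_store_go()
        // (a non-globally-ordered 64-byte store on KMP_MIC, a plain
        // KMP_MEMCPY of CACHE_LINE bytes otherwise; see the macros at the top
        // of this file) publishes the ICVs and the wakeup together. That is
        // why b_go is bumped on the next line, before the copies are issued.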
1169 thr_bar->b_go = KMP_BARRIER_STATE_BUMP; 1170 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of 1171 // the cache line 1172 ngo_load(&thr_bar->th_fixed_icvs); 1173 // This loops over all the threads skipping only the leaf nodes in the 1174 // hierarchy 1175 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; 1176 child_tid += thr_bar->skip_per_level[1]) { 1177 kmp_bstate_t *child_bar = 1178 &team->t.t_threads[child_tid]->th.th_bar[bt].bb; 1179 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1180 "releasing T#%d(%d:%d)" 1181 " go(%p): %u => %u\n", 1182 gtid, team->t.t_id, tid, 1183 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1184 child_tid, &child_bar->b_go, child_bar->b_go, 1185 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1186 // Use ngo store (if available) to both store ICVs and release child 1187 // via child's b_go 1188 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); 1189 } 1190 ngo_sync(); 1191 } 1192 TCW_8(thr_bar->b_go, 1193 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1194 // Now, release leaf children 1195 if (thr_bar->leaf_kids) { // if there are any 1196 // We test team_change on the off-chance that the level 1 team changed. 1197 if (team_change || 1198 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new 1199 if (old_leaf_kids) { // release old leaf kids 1200 thr_bar->b_go |= old_leaf_state; 1201 } 1202 // Release new leaf kids 1203 last = tid + thr_bar->skip_per_level[1]; 1204 if (last > nproc) 1205 last = nproc; 1206 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; 1207 ++child_tid) { // skip_per_level[0]=1 1208 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1209 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1210 KA_TRACE( 1211 20, 1212 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" 1213 " T#%d(%d:%d) go(%p): %u => %u\n", 1214 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 1215 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 1216 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1217 // Release child using child's b_go flag 1218 ANNOTATE_BARRIER_BEGIN(child_thr); 1219 kmp_flag_64 flag(&child_bar->b_go, child_thr); 1220 flag.release(); 1221 } 1222 } else { // Release all children at once with leaf_state bits on my own 1223 // b_go flag 1224 thr_bar->b_go |= thr_bar->leaf_state; 1225 } 1226 } 1227 } else { // Blocktime is not infinite; do a simple hierarchical release 1228 for (int d = thr_bar->my_level - 1; d >= 0; 1229 --d) { // Release highest level threads first 1230 last = tid + thr_bar->skip_per_level[d + 1]; 1231 kmp_uint32 skip = thr_bar->skip_per_level[d]; 1232 if (last > nproc) 1233 last = nproc; 1234 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 1235 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1236 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1237 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1238 "releasing T#%d(%d:%d) go(%p): %u => %u\n", 1239 gtid, team->t.t_id, tid, 1240 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1241 child_tid, &child_bar->b_go, child_bar->b_go, 1242 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1243 // Release child using child's b_go flag 1244 ANNOTATE_BARRIER_BEGIN(child_thr); 1245 kmp_flag_64 flag(&child_bar->b_go, child_thr); 1246 flag.release(); 1247 } 1248 } 1249 } 1250 #if KMP_BARRIER_ICV_PUSH 1251 if (propagate_icvs && !KMP_MASTER_TID(tid)) 1252 // non-leaves copy ICVs from fixed ICVs to 
local dest 1253 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1254 &thr_bar->th_fixed_icvs); 1255 #endif // KMP_BARRIER_ICV_PUSH 1256 } 1257 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for " 1258 "barrier type %d\n", 1259 gtid, team->t.t_id, tid, bt)); 1260 } 1261 1262 // End of Barrier Algorithms 1263 1264 // type traits for cancellable value 1265 // if cancellable is true, then is_cancellable is a normal boolean variable 1266 // if cancellable is false, then is_cancellable is a compile time constant 1267 template <bool cancellable> struct is_cancellable {}; 1268 template <> struct is_cancellable<true> { 1269 bool value; 1270 is_cancellable() : value(false) {} 1271 is_cancellable(bool b) : value(b) {} 1272 is_cancellable &operator=(bool b) { 1273 value = b; 1274 return *this; 1275 } 1276 operator bool() const { return value; } 1277 }; 1278 template <> struct is_cancellable<false> { 1279 is_cancellable &operator=(bool b) { return *this; } 1280 constexpr operator bool() const { return false; } 1281 }; 1282 1283 // Internal function to do a barrier. 1284 /* If is_split is true, do a split barrier, otherwise, do a plain barrier 1285 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split 1286 barrier 1287 When cancellable = false, 1288 Returns 0 if master thread, 1 if worker thread. 1289 When cancellable = true 1290 Returns 0 if not cancelled, 1 if cancelled. */ 1291 template <bool cancellable = false> 1292 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, 1293 size_t reduce_size, void *reduce_data, 1294 void (*reduce)(void *, void *)) { 1295 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); 1296 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 1297 int tid = __kmp_tid_from_gtid(gtid); 1298 kmp_info_t *this_thr = __kmp_threads[gtid]; 1299 kmp_team_t *team = this_thr->th.th_team; 1300 int status = 0; 1301 is_cancellable<cancellable> cancelled; 1302 #if OMPT_SUPPORT && OMPT_OPTIONAL 1303 ompt_data_t *my_task_data; 1304 ompt_data_t *my_parallel_data; 1305 void *return_address; 1306 ompt_sync_region_t barrier_kind; 1307 #endif 1308 1309 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid, 1310 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1311 1312 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); 1313 #if OMPT_SUPPORT 1314 if (ompt_enabled.enabled) { 1315 #if OMPT_OPTIONAL 1316 my_task_data = OMPT_CUR_TASK_DATA(this_thr); 1317 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 1318 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1319 barrier_kind = __ompt_get_barrier_kind(bt, this_thr); 1320 if (ompt_enabled.ompt_callback_sync_region) { 1321 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1322 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1323 return_address); 1324 } 1325 if (ompt_enabled.ompt_callback_sync_region_wait) { 1326 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1327 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1328 return_address); 1329 } 1330 #endif 1331 // It is OK to report the barrier state after the barrier begin callback. 1332 // According to the OMPT specification, a compliant implementation may 1333 // even delay reporting this state until the barrier begins to wait. 1334 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; 1335 } 1336 #endif 1337 1338 if (!team->t.t_serialized) { 1339 #if USE_ITT_BUILD 1340 // This value will be used in itt notify events below. 
1341 void *itt_sync_obj = NULL; 1342 #if USE_ITT_NOTIFY 1343 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1344 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 1345 #endif 1346 #endif /* USE_ITT_BUILD */ 1347 if (__kmp_tasking_mode == tskm_extra_barrier) { 1348 __kmp_tasking_barrier(team, this_thr, gtid); 1349 KA_TRACE(15, 1350 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, 1351 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1352 } 1353 1354 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 1355 access it when the team struct is not guaranteed to exist. */ 1356 // See note about the corresponding code in __kmp_join_barrier() being 1357 // performance-critical. 1358 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1359 #if KMP_USE_MONITOR 1360 this_thr->th.th_team_bt_intervals = 1361 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1362 this_thr->th.th_team_bt_set = 1363 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1364 #else 1365 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 1366 #endif 1367 } 1368 1369 #if USE_ITT_BUILD 1370 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1371 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 1372 #endif /* USE_ITT_BUILD */ 1373 #if USE_DEBUGGER 1374 // Let the debugger know: the thread arrived to the barrier and waiting. 1375 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure. 1376 team->t.t_bar[bt].b_master_arrived += 1; 1377 } else { 1378 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; 1379 } // if 1380 #endif /* USE_DEBUGGER */ 1381 if (reduce != NULL) { 1382 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 1383 this_thr->th.th_local.reduce_data = reduce_data; 1384 } 1385 1386 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) 1387 // use 0 to only setup the current team if nthreads > 1 1388 __kmp_task_team_setup(this_thr, team, 0); 1389 1390 if (cancellable) { 1391 cancelled = __kmp_linear_barrier_gather_cancellable( 1392 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1393 } else { 1394 switch (__kmp_barrier_gather_pattern[bt]) { 1395 case bp_hyper_bar: { 1396 // don't set branch bits to 0; use linear 1397 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1398 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, 1399 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1400 break; 1401 } 1402 case bp_hierarchical_bar: { 1403 __kmp_hierarchical_barrier_gather( 1404 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1405 break; 1406 } 1407 case bp_tree_bar: { 1408 // don't set branch bits to 0; use linear 1409 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1410 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, 1411 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1412 break; 1413 } 1414 default: { 1415 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, 1416 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1417 } 1418 } 1419 } 1420 1421 KMP_MB(); 1422 1423 if (KMP_MASTER_TID(tid)) { 1424 status = 0; 1425 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 1426 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 1427 } 1428 #if USE_DEBUGGER 1429 // Let the debugger know: All threads are arrived and starting leaving the 1430 // barrier. 
1431 team->t.t_bar[bt].b_team_arrived += 1; 1432 #endif 1433 1434 if (__kmp_omp_cancellation) { 1435 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request); 1436 // Reset cancellation flag for worksharing constructs 1437 if (cancel_request == cancel_loop || 1438 cancel_request == cancel_sections) { 1439 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq); 1440 } 1441 } 1442 #if USE_ITT_BUILD 1443 /* TODO: In case of split reduction barrier, master thread may send 1444 acquired event early, before the final summation into the shared 1445 variable is done (final summation can be a long operation for array 1446 reductions). */ 1447 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1448 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1449 #endif /* USE_ITT_BUILD */ 1450 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1451 // Barrier - report frame end (only if active_level == 1) 1452 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 1453 __kmp_forkjoin_frames_mode && 1454 this_thr->th.th_teams_microtask == NULL && 1455 team->t.t_active_level == 1) { 1456 ident_t *loc = __kmp_threads[gtid]->th.th_ident; 1457 kmp_uint64 cur_time = __itt_get_timestamp(); 1458 kmp_info_t **other_threads = team->t.t_threads; 1459 int nproc = this_thr->th.th_team_nproc; 1460 int i; 1461 switch (__kmp_forkjoin_frames_mode) { 1462 case 1: 1463 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1464 loc, nproc); 1465 this_thr->th.th_frame_time = cur_time; 1466 break; 1467 case 2: // AC 2015-01-19: currently does not work for hierarchical (to 1468 // be fixed) 1469 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1470 1, loc, nproc); 1471 break; 1472 case 3: 1473 if (__itt_metadata_add_ptr) { 1474 // Initialize with master's wait time 1475 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; 1476 // Set arrive time to zero to be able to check it in 1477 // __kmp_invoke_task(); the same is done inside the loop below 1478 this_thr->th.th_bar_arrive_time = 0; 1479 for (i = 1; i < nproc; ++i) { 1480 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 1481 other_threads[i]->th.th_bar_arrive_time = 0; 1482 } 1483 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 1484 cur_time, delta, 1485 (kmp_uint64)(reduce != NULL)); 1486 } 1487 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1488 loc, nproc); 1489 this_thr->th.th_frame_time = cur_time; 1490 break; 1491 } 1492 } 1493 #endif /* USE_ITT_BUILD */ 1494 } else { 1495 status = 1; 1496 #if USE_ITT_BUILD 1497 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1498 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1499 #endif /* USE_ITT_BUILD */ 1500 } 1501 if ((status == 1 || !is_split) && !cancelled) { 1502 if (cancellable) { 1503 cancelled = __kmp_linear_barrier_release_cancellable( 1504 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1505 } else { 1506 switch (__kmp_barrier_release_pattern[bt]) { 1507 case bp_hyper_bar: { 1508 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1509 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 1510 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1511 break; 1512 } 1513 case bp_hierarchical_bar: { 1514 __kmp_hierarchical_barrier_release( 1515 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1516 break; 1517 } 1518 case bp_tree_bar: { 1519 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1520 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 1521 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1522 break; 1523 } 1524 default: { 
1525 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 1526 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1527 } 1528 } 1529 } 1530 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 1531 __kmp_task_team_sync(this_thr, team); 1532 } 1533 } 1534 1535 #if USE_ITT_BUILD 1536 /* GEH: TODO: Move this under if-condition above and also include in 1537 __kmp_end_split_barrier(). This will more accurately represent the actual 1538 release time of the threads for split barriers. */ 1539 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1540 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 1541 #endif /* USE_ITT_BUILD */ 1542 } else { // Team is serialized. 1543 status = 0; 1544 if (__kmp_tasking_mode != tskm_immediate_exec) { 1545 if (this_thr->th.th_task_team != NULL) { 1546 #if USE_ITT_NOTIFY 1547 void *itt_sync_obj = NULL; 1548 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 1549 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 1550 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 1551 } 1552 #endif 1553 1554 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == 1555 TRUE); 1556 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 1557 __kmp_task_team_setup(this_thr, team, 0); 1558 1559 #if USE_ITT_BUILD 1560 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1561 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 1562 #endif /* USE_ITT_BUILD */ 1563 } 1564 } 1565 } 1566 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", 1567 gtid, __kmp_team_from_gtid(gtid)->t.t_id, 1568 __kmp_tid_from_gtid(gtid), status)); 1569 1570 #if OMPT_SUPPORT 1571 if (ompt_enabled.enabled) { 1572 #if OMPT_OPTIONAL 1573 if (ompt_enabled.ompt_callback_sync_region_wait) { 1574 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1575 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, 1576 return_address); 1577 } 1578 if (ompt_enabled.ompt_callback_sync_region) { 1579 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1580 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, 1581 return_address); 1582 } 1583 #endif 1584 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1585 } 1586 #endif 1587 ANNOTATE_BARRIER_END(&team->t.t_bar); 1588 1589 if (cancellable) 1590 return (int)cancelled; 1591 return status; 1592 } 1593 1594 // Returns 0 if master thread, 1 if worker thread. 
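// For example, a plain, non-split barrier with no reduction is requested as
//   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
// which is the fallback call made by __kmp_barrier_gomp_cancel() below when
// OpenMP cancellation is disabled.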
1595 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, 1596 size_t reduce_size, void *reduce_data, 1597 void (*reduce)(void *, void *)) { 1598 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data, 1599 reduce); 1600 } 1601 1602 #if defined(KMP_GOMP_COMPAT) 1603 // Returns 1 if cancelled, 0 otherwise 1604 int __kmp_barrier_gomp_cancel(int gtid) { 1605 if (__kmp_omp_cancellation) { 1606 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 1607 0, NULL, NULL); 1608 if (cancelled) { 1609 int tid = __kmp_tid_from_gtid(gtid); 1610 kmp_info_t *this_thr = __kmp_threads[gtid]; 1611 if (KMP_MASTER_TID(tid)) { 1612 // Master does not need to revert anything 1613 } else { 1614 // Workers need to revert their private b_arrived flag 1615 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= 1616 KMP_BARRIER_STATE_BUMP; 1617 } 1618 } 1619 return cancelled; 1620 } 1621 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 1622 return FALSE; 1623 } 1624 #endif 1625 1626 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { 1627 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); 1628 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 1629 int tid = __kmp_tid_from_gtid(gtid); 1630 kmp_info_t *this_thr = __kmp_threads[gtid]; 1631 kmp_team_t *team = this_thr->th.th_team; 1632 1633 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); 1634 if (!team->t.t_serialized) { 1635 if (KMP_MASTER_GTID(gtid)) { 1636 switch (__kmp_barrier_release_pattern[bt]) { 1637 case bp_hyper_bar: { 1638 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1639 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 1640 FALSE USE_ITT_BUILD_ARG(NULL)); 1641 break; 1642 } 1643 case bp_hierarchical_bar: { 1644 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, 1645 FALSE USE_ITT_BUILD_ARG(NULL)); 1646 break; 1647 } 1648 case bp_tree_bar: { 1649 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1650 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 1651 FALSE USE_ITT_BUILD_ARG(NULL)); 1652 break; 1653 } 1654 default: { 1655 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 1656 FALSE USE_ITT_BUILD_ARG(NULL)); 1657 } 1658 } 1659 if (__kmp_tasking_mode != tskm_immediate_exec) { 1660 __kmp_task_team_sync(this_thr, team); 1661 } // if 1662 } 1663 } 1664 ANNOTATE_BARRIER_END(&team->t.t_bar); 1665 } 1666 1667 void __kmp_join_barrier(int gtid) { 1668 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); 1669 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 1670 kmp_info_t *this_thr = __kmp_threads[gtid]; 1671 kmp_team_t *team; 1672 kmp_uint nproc; 1673 kmp_info_t *master_thread; 1674 int tid; 1675 #ifdef KMP_DEBUG 1676 int team_id; 1677 #endif /* KMP_DEBUG */ 1678 #if USE_ITT_BUILD 1679 void *itt_sync_obj = NULL; 1680 #if USE_ITT_NOTIFY 1681 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need 1682 // Get object created at fork_barrier 1683 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 1684 #endif 1685 #endif /* USE_ITT_BUILD */ 1686 KMP_MB(); 1687 1688 // Get current info 1689 team = this_thr->th.th_team; 1690 nproc = this_thr->th.th_team_nproc; 1691 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); 1692 tid = __kmp_tid_from_gtid(gtid); 1693 #ifdef KMP_DEBUG 1694 team_id = team->t.t_id; 1695 #endif /* KMP_DEBUG */ 1696 master_thread = this_thr->th.th_team_master; 1697 #ifdef KMP_DEBUG 1698 if (master_thread != team->t.t_threads[0]) { 1699 __kmp_print_structure(); 1700 } 1701 #endif /* KMP_DEBUG */ 1702 KMP_DEBUG_ASSERT(master_thread 
                   == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin,
          my_parallel_data, my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin,
          my_parallel_data, my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                     team->t.t_task_team[this_thr->th.th_task_state]);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss and slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case.
  */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the master thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs. */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have the master thread flag the workers to indicate they are now
    // waiting for the next parallel region; also wake them up so they switch
    // their timers to idle.
1821 for (int i = 0; i < team->t.t_nproc; ++i) { 1822 kmp_info_t *team_thread = team->t.t_threads[i]; 1823 if (team_thread == this_thr) 1824 continue; 1825 team_thread->th.th_stats->setIdleFlag(); 1826 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && 1827 team_thread->th.th_sleep_loc != NULL) 1828 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), 1829 team_thread->th.th_sleep_loc); 1830 } 1831 #endif 1832 #if USE_ITT_BUILD 1833 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1834 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1835 #endif /* USE_ITT_BUILD */ 1836 1837 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1838 // Join barrier - report frame end 1839 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 1840 __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL && 1841 team->t.t_active_level == 1) { 1842 kmp_uint64 cur_time = __itt_get_timestamp(); 1843 ident_t *loc = team->t.t_ident; 1844 kmp_info_t **other_threads = team->t.t_threads; 1845 int nproc = this_thr->th.th_team_nproc; 1846 int i; 1847 switch (__kmp_forkjoin_frames_mode) { 1848 case 1: 1849 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1850 loc, nproc); 1851 break; 1852 case 2: 1853 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, 1854 loc, nproc); 1855 break; 1856 case 3: 1857 if (__itt_metadata_add_ptr) { 1858 // Initialize with master's wait time 1859 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; 1860 // Set arrive time to zero to be able to check it in 1861 // __kmp_invoke_task(); the same is done inside the loop below 1862 this_thr->th.th_bar_arrive_time = 0; 1863 for (i = 1; i < nproc; ++i) { 1864 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 1865 other_threads[i]->th.th_bar_arrive_time = 0; 1866 } 1867 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 1868 cur_time, delta, 0); 1869 } 1870 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1871 loc, nproc); 1872 this_thr->th.th_frame_time = cur_time; 1873 break; 1874 } 1875 } 1876 #endif /* USE_ITT_BUILD */ 1877 } 1878 #if USE_ITT_BUILD 1879 else { 1880 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1881 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1882 } 1883 #endif /* USE_ITT_BUILD */ 1884 1885 #if KMP_DEBUG 1886 if (KMP_MASTER_TID(tid)) { 1887 KA_TRACE( 1888 15, 1889 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", 1890 gtid, team_id, tid, nproc)); 1891 } 1892 #endif /* KMP_DEBUG */ 1893 1894 // TODO now, mark worker threads as done so they may be disbanded 1895 KMP_MB(); // Flush all pending memory write invalidates. 1896 KA_TRACE(10, 1897 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); 1898 1899 ANNOTATE_BARRIER_END(&team->t.t_bar); 1900 } 1901 1902 // TODO release worker threads' fork barriers as we are ready instead of all at 1903 // once 1904 void __kmp_fork_barrier(int gtid, int tid) { 1905 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); 1906 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 1907 kmp_info_t *this_thr = __kmp_threads[gtid]; 1908 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; 1909 #if USE_ITT_BUILD 1910 void *itt_sync_obj = NULL; 1911 #endif /* USE_ITT_BUILD */ 1912 if (team) 1913 ANNOTATE_BARRIER_END(&team->t.t_bar); 1914 1915 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, 1916 (team != NULL) ? 
team->t.t_id : -1, tid)); 1917 1918 // th_team pointer only valid for master thread here 1919 if (KMP_MASTER_TID(tid)) { 1920 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1921 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 1922 // Create itt barrier object 1923 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); 1924 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing 1925 } 1926 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1927 1928 #ifdef KMP_DEBUG 1929 kmp_info_t **other_threads = team->t.t_threads; 1930 int i; 1931 1932 // Verify state 1933 KMP_MB(); 1934 1935 for (i = 1; i < team->t.t_nproc; ++i) { 1936 KA_TRACE(500, 1937 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " 1938 "== %u.\n", 1939 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, 1940 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, 1941 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); 1942 KMP_DEBUG_ASSERT( 1943 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) & 1944 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE); 1945 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); 1946 } 1947 #endif 1948 1949 if (__kmp_tasking_mode != tskm_immediate_exec) { 1950 // 0 indicates setup current task team if nthreads > 1 1951 __kmp_task_team_setup(this_thr, team, 0); 1952 } 1953 1954 /* The master thread may have changed its blocktime between the join barrier 1955 and the fork barrier. Copy the blocktime info to the thread, where 1956 __kmp_wait_template() can access it when the team struct is not 1957 guaranteed to exist. */ 1958 // See note about the corresponding code in __kmp_join_barrier() being 1959 // performance-critical 1960 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1961 #if KMP_USE_MONITOR 1962 this_thr->th.th_team_bt_intervals = 1963 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1964 this_thr->th.th_team_bt_set = 1965 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1966 #else 1967 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 1968 #endif 1969 } 1970 } // master 1971 1972 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { 1973 case bp_hyper_bar: { 1974 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 1975 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 1976 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1977 break; 1978 } 1979 case bp_hierarchical_bar: { 1980 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 1981 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1982 break; 1983 } 1984 case bp_tree_bar: { 1985 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 1986 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 1987 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1988 break; 1989 } 1990 default: { 1991 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 1992 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1993 } 1994 } 1995 1996 #if OMPT_SUPPORT 1997 if (ompt_enabled.enabled && 1998 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 1999 int ds_tid = this_thr->th.th_info.ds.ds_tid; 2000 ompt_data_t *task_data = (team) 2001 ? 
OMPT_CUR_TASK_DATA(this_thr) 2002 : &(this_thr->th.ompt_thread_info.task_data); 2003 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 2004 #if OMPT_OPTIONAL 2005 void *codeptr = NULL; 2006 if (KMP_MASTER_TID(ds_tid) && 2007 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 2008 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 2009 codeptr = team->t.ompt_team_info.master_return_address; 2010 if (ompt_enabled.ompt_callback_sync_region_wait) { 2011 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2012 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 2013 codeptr); 2014 } 2015 if (ompt_enabled.ompt_callback_sync_region) { 2016 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2017 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 2018 codeptr); 2019 } 2020 #endif 2021 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 2022 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2023 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 2024 } 2025 } 2026 #endif 2027 2028 // Early exit for reaping threads releasing forkjoin barrier 2029 if (TCR_4(__kmp_global.g.g_done)) { 2030 this_thr->th.th_task_team = NULL; 2031 2032 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2033 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2034 if (!KMP_MASTER_TID(tid)) { 2035 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2036 if (itt_sync_obj) 2037 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2038 } 2039 } 2040 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2041 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); 2042 return; 2043 } 2044 2045 /* We can now assume that a valid team structure has been allocated by the 2046 master and propagated to all worker threads. The current thread, however, 2047 may not be part of the team, so we can't blindly assume that the team 2048 pointer is non-null. */ 2049 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); 2050 KMP_DEBUG_ASSERT(team != NULL); 2051 tid = __kmp_tid_from_gtid(gtid); 2052 2053 #if KMP_BARRIER_ICV_PULL 2054 /* Master thread's copy of the ICVs was set up on the implicit taskdata in 2055 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's 2056 implicit task has this data before this function is called. We cannot 2057 modify __kmp_fork_call() to look at the fixed ICVs in the master's thread 2058 struct, because it is not always the case that the threads arrays have 2059 been allocated when __kmp_fork_call() is executed. */ 2060 { 2061 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 2062 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs 2063 // Copy the initial ICVs from the master's thread struct to the implicit 2064 // task for this tid. 
2065 KA_TRACE(10, 2066 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); 2067 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, 2068 tid, FALSE); 2069 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 2070 &team->t.t_threads[0] 2071 ->th.th_bar[bs_forkjoin_barrier] 2072 .bb.th_fixed_icvs); 2073 } 2074 } 2075 #endif // KMP_BARRIER_ICV_PULL 2076 2077 if (__kmp_tasking_mode != tskm_immediate_exec) { 2078 __kmp_task_team_sync(this_thr, team); 2079 } 2080 2081 #if KMP_AFFINITY_SUPPORTED 2082 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 2083 if (proc_bind == proc_bind_intel) { 2084 // Call dynamic affinity settings 2085 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) { 2086 __kmp_balanced_affinity(this_thr, team->t.t_nproc); 2087 } 2088 } else if (proc_bind != proc_bind_false) { 2089 if (this_thr->th.th_new_place == this_thr->th.th_current_place) { 2090 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", 2091 __kmp_gtid_from_thread(this_thr), 2092 this_thr->th.th_current_place)); 2093 } else { 2094 __kmp_affinity_set_place(gtid); 2095 } 2096 } 2097 #endif // KMP_AFFINITY_SUPPORTED 2098 // Perform the display affinity functionality 2099 if (__kmp_display_affinity) { 2100 if (team->t.t_display_affinity 2101 #if KMP_AFFINITY_SUPPORTED 2102 || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) 2103 #endif 2104 ) { 2105 // NULL means use the affinity-format-var ICV 2106 __kmp_aux_display_affinity(gtid, NULL); 2107 this_thr->th.th_prev_num_threads = team->t.t_nproc; 2108 this_thr->th.th_prev_level = team->t.t_level; 2109 } 2110 } 2111 if (!KMP_MASTER_TID(tid)) 2112 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator); 2113 2114 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2115 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2116 if (!KMP_MASTER_TID(tid)) { 2117 // Get correct barrier object 2118 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2119 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired 2120 } // (prepare called inside barrier_release) 2121 } 2122 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2123 ANNOTATE_BARRIER_END(&team->t.t_bar); 2124 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, 2125 team->t.t_id, tid)); 2126 } 2127 2128 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 2129 kmp_internal_control_t *new_icvs, ident_t *loc) { 2130 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); 2131 2132 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); 2133 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 2134 2135 /* Master thread's copy of the ICVs was set up on the implicit taskdata in 2136 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's 2137 implicit task has this data before this function is called. */ 2138 #if KMP_BARRIER_ICV_PULL 2139 /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains 2140 untouched), where all of the worker threads can access them and make their 2141 own copies after the barrier. 
  */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
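
// A minimal sketch (guarded by "#if 0" so it is never compiled) of how the
// is_cancellable<> helper defined above is meant to be used: for the
// cancellable = false instantiation the flag collapses to a constexpr false,
// so the compiler can fold all cancellation checks away, while for
// cancellable = true it behaves as an ordinary runtime boolean. The function
// name below is purely illustrative and does not exist in the runtime.
#if 0
template <bool cancellable> static int example_cancellable_barrier_body() {
  is_cancellable<cancellable> cancelled; // constant false when !cancellable
  if (cancellable) {
    // Only this instantiation can ever observe a runtime value of true;
    // for is_cancellable<false> the assignment operator is a no-op.
    cancelled = true;
  }
  if (cancelled) // constexpr false for is_cancellable<false>
    return 1; // cancelled
  return 0; // not cancelled
}
#endif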