1 /* 2 * kmp_barrier.cpp 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_wait_release.h" 15 #include "kmp_itt.h" 16 #include "kmp_os.h" 17 #include "kmp_stats.h" 18 #include "ompt-specific.h" 19 20 #if KMP_MIC 21 #include <immintrin.h> 22 #define USE_NGO_STORES 1 23 #endif // KMP_MIC 24 25 #if KMP_MIC && USE_NGO_STORES 26 // ICV copying 27 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) 28 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) 29 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) 30 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory") 31 #else 32 #define ngo_load(src) ((void)0) 33 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src)) 34 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE) 35 #define ngo_sync() ((void)0) 36 #endif /* KMP_MIC && USE_NGO_STORES */ 37 38 void __kmp_print_structure(void); // Forward declaration 39 40 // ---------------------------- Barrier Algorithms ---------------------------- 41 42 // Linear Barrier 43 template <bool cancellable = false> 44 static bool __kmp_linear_barrier_gather_template( 45 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 46 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 47 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather); 48 kmp_team_t *team = this_thr->th.th_team; 49 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 50 kmp_info_t **other_threads = team->t.t_threads; 51 52 KA_TRACE( 53 20, 54 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 55 gtid, team->t.t_id, tid, bt)); 56 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 57 58 #if USE_ITT_BUILD && USE_ITT_NOTIFY 59 // Barrier imbalance - save arrive time to the thread 60 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 61 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 62 __itt_get_timestamp(); 63 } 64 #endif 65 // We now perform a linear reduction to signal that all of the threads have 66 // arrived. 67 if (!KMP_MASTER_TID(tid)) { 68 KA_TRACE(20, 69 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" 70 "arrived(%p): %llu => %llu\n", 71 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team), 72 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived, 73 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 74 // Mark arrival to primary thread 75 /* After performing this write, a worker thread may not assume that the team 76 is valid any more - it could be deallocated by the primary thread at any 77 time. */ 78 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); 79 flag.release(); 80 } else { 81 kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; 82 int nproc = this_thr->th.th_team_nproc; 83 int i; 84 // Don't have to worry about sleep bit here or atomic since team setting 85 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; 86 87 // Collect all the worker team member threads. 
88 for (i = 1; i < nproc; ++i) { 89 #if KMP_CACHE_MANAGE 90 // Prefetch next thread's arrived count 91 if (i + 1 < nproc) 92 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived); 93 #endif /* KMP_CACHE_MANAGE */ 94 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " 95 "arrived(%p) == %llu\n", 96 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), 97 team->t.t_id, i, 98 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); 99 100 // Wait for worker thread to arrive 101 if (cancellable) { 102 kmp_flag_64<true, false> flag( 103 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); 104 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj))) 105 return true; 106 } else { 107 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, 108 new_state); 109 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 110 } 111 #if USE_ITT_BUILD && USE_ITT_NOTIFY 112 // Barrier imbalance - write min of the thread time and the other thread 113 // time to the thread. 114 if (__kmp_forkjoin_frames_mode == 2) { 115 this_thr->th.th_bar_min_time = KMP_MIN( 116 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time); 117 } 118 #endif 119 if (reduce) { 120 KA_TRACE(100, 121 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", 122 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), 123 team->t.t_id, i)); 124 OMPT_REDUCTION_DECL(this_thr, gtid); 125 OMPT_REDUCTION_BEGIN; 126 (*reduce)(this_thr->th.th_local.reduce_data, 127 other_threads[i]->th.th_local.reduce_data); 128 OMPT_REDUCTION_END; 129 } 130 } 131 // Don't have to worry about sleep bit here or atomic since team setting 132 team_bar->b_arrived = new_state; 133 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d " 134 "arrived(%p) = %llu\n", 135 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, 136 new_state)); 137 } 138 KA_TRACE( 139 20, 140 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 141 gtid, team->t.t_id, tid, bt)); 142 return false; 143 } 144 145 template <bool cancellable = false> 146 static bool __kmp_linear_barrier_release_template( 147 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 148 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 149 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release); 150 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 151 kmp_team_t *team; 152 153 if (KMP_MASTER_TID(tid)) { 154 unsigned int i; 155 kmp_uint32 nproc = this_thr->th.th_team_nproc; 156 kmp_info_t **other_threads; 157 158 team = __kmp_threads[gtid]->th.th_team; 159 KMP_DEBUG_ASSERT(team != NULL); 160 other_threads = team->t.t_threads; 161 162 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for " 163 "barrier type %d\n", 164 gtid, team->t.t_id, tid, bt)); 165 166 if (nproc > 1) { 167 #if KMP_BARRIER_ICV_PUSH 168 { 169 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 170 if (propagate_icvs) { 171 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); 172 for (i = 1; i < nproc; ++i) { 173 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], 174 team, i, FALSE); 175 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, 176 &team->t.t_implicit_task_taskdata[0].td_icvs); 177 } 178 ngo_sync(); 179 } 180 } 181 #endif // KMP_BARRIER_ICV_PUSH 182 183 // Now, release all of the worker threads 184 for (i = 1; i < nproc; ++i) { 185 #if KMP_CACHE_MANAGE 186 // Prefetch next thread's go flag 187 if (i + 1 < nproc) 188 
KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go); 189 #endif /* KMP_CACHE_MANAGE */ 190 KA_TRACE( 191 20, 192 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " 193 "go(%p): %u => %u\n", 194 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid, 195 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go, 196 other_threads[i]->th.th_bar[bt].bb.b_go, 197 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); 198 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go, 199 other_threads[i]); 200 flag.release(); 201 } 202 } 203 } else { // Wait for the PRIMARY thread to release us 204 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", 205 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 206 if (cancellable) { 207 kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 208 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj))) 209 return true; 210 } else { 211 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 212 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 213 } 214 #if USE_ITT_BUILD && USE_ITT_NOTIFY 215 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 216 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is 217 // disabled) 218 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 219 // Cancel wait on previous parallel region... 220 __kmp_itt_task_starting(itt_sync_obj); 221 222 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 223 return false; 224 225 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 226 if (itt_sync_obj != NULL) 227 // Call prepare as early as possible for "new" barrier 228 __kmp_itt_task_finished(itt_sync_obj); 229 } else 230 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 231 // Early exit for reaping threads releasing forkjoin barrier 232 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 233 return false; 234 // The worker thread may now assume that the team is valid. 235 #ifdef KMP_DEBUG 236 tid = __kmp_tid_from_gtid(gtid); 237 team = __kmp_threads[gtid]->th.th_team; 238 #endif 239 KMP_DEBUG_ASSERT(team != NULL); 240 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 241 KA_TRACE(20, 242 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 243 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 244 KMP_MB(); // Flush all pending memory write invalidates. 
245 } 246 KA_TRACE( 247 20, 248 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 249 gtid, team->t.t_id, tid, bt)); 250 return false; 251 } 252 253 static void __kmp_linear_barrier_gather( 254 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 255 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 256 __kmp_linear_barrier_gather_template<false>( 257 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 258 } 259 260 static bool __kmp_linear_barrier_gather_cancellable( 261 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 262 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 263 return __kmp_linear_barrier_gather_template<true>( 264 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 265 } 266 267 static void __kmp_linear_barrier_release( 268 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 269 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 270 __kmp_linear_barrier_release_template<false>( 271 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); 272 } 273 274 static bool __kmp_linear_barrier_release_cancellable( 275 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 276 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 277 return __kmp_linear_barrier_release_template<true>( 278 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); 279 } 280 281 // Tree barrier 282 static void __kmp_tree_barrier_gather( 283 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 284 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 285 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather); 286 kmp_team_t *team = this_thr->th.th_team; 287 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 288 kmp_info_t **other_threads = team->t.t_threads; 289 kmp_uint32 nproc = this_thr->th.th_team_nproc; 290 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; 291 kmp_uint32 branch_factor = 1 << branch_bits; 292 kmp_uint32 child; 293 kmp_uint32 child_tid; 294 kmp_uint64 new_state = 0; 295 296 KA_TRACE( 297 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 298 gtid, team->t.t_id, tid, bt)); 299 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 300 301 #if USE_ITT_BUILD && USE_ITT_NOTIFY 302 // Barrier imbalance - save arrive time to the thread 303 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 304 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 305 __itt_get_timestamp(); 306 } 307 #endif 308 // Perform tree gather to wait until all threads have arrived; reduce any 309 // required data as we go 310 child_tid = (tid << branch_bits) + 1; 311 if (child_tid < nproc) { 312 // Parent threads wait for all their children to arrive 313 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 314 child = 1; 315 do { 316 kmp_info_t *child_thr = other_threads[child_tid]; 317 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 318 #if KMP_CACHE_MANAGE 319 // Prefetch next thread's arrived count 320 if (child + 1 <= branch_factor && child_tid + 1 < nproc) 321 KMP_CACHE_PREFETCH( 322 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived); 323 #endif /* KMP_CACHE_MANAGE */ 324 KA_TRACE(20, 325 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 326 "arrived(%p) == %llu\n", 327 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 328 team->t.t_id, child_tid, 
&child_bar->b_arrived, new_state)); 329 // Wait for child to arrive 330 kmp_flag_64<> flag(&child_bar->b_arrived, new_state); 331 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 332 #if USE_ITT_BUILD && USE_ITT_NOTIFY 333 // Barrier imbalance - write min of the thread time and a child time to 334 // the thread. 335 if (__kmp_forkjoin_frames_mode == 2) { 336 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, 337 child_thr->th.th_bar_min_time); 338 } 339 #endif 340 if (reduce) { 341 KA_TRACE(100, 342 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", 343 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 344 team->t.t_id, child_tid)); 345 OMPT_REDUCTION_DECL(this_thr, gtid); 346 OMPT_REDUCTION_BEGIN; 347 (*reduce)(this_thr->th.th_local.reduce_data, 348 child_thr->th.th_local.reduce_data); 349 OMPT_REDUCTION_END; 350 } 351 child++; 352 child_tid++; 353 } while (child <= branch_factor && child_tid < nproc); 354 } 355 356 if (!KMP_MASTER_TID(tid)) { // Worker threads 357 kmp_int32 parent_tid = (tid - 1) >> branch_bits; 358 359 KA_TRACE(20, 360 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 361 "arrived(%p): %llu => %llu\n", 362 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), 363 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, 364 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 365 366 // Mark arrival to parent thread 367 /* After performing this write, a worker thread may not assume that the team 368 is valid any more - it could be deallocated by the primary thread at any 369 time. */ 370 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]); 371 flag.release(); 372 } else { 373 // Need to update the team arrived pointer if we are the primary thread 374 if (nproc > 1) // New value was already computed above 375 team->t.t_bar[bt].b_arrived = new_state; 376 else 377 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; 378 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d " 379 "arrived(%p) = %llu\n", 380 gtid, team->t.t_id, tid, team->t.t_id, 381 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 382 } 383 KA_TRACE(20, 384 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 385 gtid, team->t.t_id, tid, bt)); 386 } 387 388 static void __kmp_tree_barrier_release( 389 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 390 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 391 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release); 392 kmp_team_t *team; 393 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 394 kmp_uint32 nproc; 395 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; 396 kmp_uint32 branch_factor = 1 << branch_bits; 397 kmp_uint32 child; 398 kmp_uint32 child_tid; 399 400 // Perform a tree release for all of the threads that have been gathered 401 if (!KMP_MASTER_TID( 402 tid)) { // Handle fork barrier workers who aren't part of a team yet 403 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid, 404 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 405 // Wait for parent thread to release us 406 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 407 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 408 #if USE_ITT_BUILD && USE_ITT_NOTIFY 409 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 410 // In fork barrier where we could not get the object reliably (or 411 // ITTNOTIFY is disabled) 412 itt_sync_obj = __kmp_itt_barrier_object(gtid, 
bs_forkjoin_barrier, 0, -1); 413 // Cancel wait on previous parallel region... 414 __kmp_itt_task_starting(itt_sync_obj); 415 416 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 417 return; 418 419 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 420 if (itt_sync_obj != NULL) 421 // Call prepare as early as possible for "new" barrier 422 __kmp_itt_task_finished(itt_sync_obj); 423 } else 424 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 425 // Early exit for reaping threads releasing forkjoin barrier 426 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 427 return; 428 429 // The worker thread may now assume that the team is valid. 430 team = __kmp_threads[gtid]->th.th_team; 431 KMP_DEBUG_ASSERT(team != NULL); 432 tid = __kmp_tid_from_gtid(gtid); 433 434 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 435 KA_TRACE(20, 436 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid, 437 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 438 KMP_MB(); // Flush all pending memory write invalidates. 439 } else { 440 team = __kmp_threads[gtid]->th.th_team; 441 KMP_DEBUG_ASSERT(team != NULL); 442 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for " 443 "barrier type %d\n", 444 gtid, team->t.t_id, tid, bt)); 445 } 446 nproc = this_thr->th.th_team_nproc; 447 child_tid = (tid << branch_bits) + 1; 448 449 if (child_tid < nproc) { 450 kmp_info_t **other_threads = team->t.t_threads; 451 child = 1; 452 // Parent threads release all their children 453 do { 454 kmp_info_t *child_thr = other_threads[child_tid]; 455 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 456 #if KMP_CACHE_MANAGE 457 // Prefetch next thread's go count 458 if (child + 1 <= branch_factor && child_tid + 1 < nproc) 459 KMP_CACHE_PREFETCH( 460 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go); 461 #endif /* KMP_CACHE_MANAGE */ 462 463 #if KMP_BARRIER_ICV_PUSH 464 { 465 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 466 if (propagate_icvs) { 467 __kmp_init_implicit_task(team->t.t_ident, 468 team->t.t_threads[child_tid], team, 469 child_tid, FALSE); 470 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, 471 &team->t.t_implicit_task_taskdata[0].td_icvs); 472 } 473 } 474 #endif // KMP_BARRIER_ICV_PUSH 475 KA_TRACE(20, 476 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" 477 "go(%p): %u => %u\n", 478 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 479 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 480 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 481 // Release child from barrier 482 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 483 flag.release(); 484 child++; 485 child_tid++; 486 } while (child <= branch_factor && child_tid < nproc); 487 } 488 KA_TRACE( 489 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 490 gtid, team->t.t_id, tid, bt)); 491 } 492 493 // Hyper Barrier 494 static void __kmp_hyper_barrier_gather( 495 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 496 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 497 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather); 498 kmp_team_t *team = this_thr->th.th_team; 499 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 500 kmp_info_t **other_threads = team->t.t_threads; 501 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; 502 kmp_uint32 num_threads = this_thr->th.th_team_nproc; 503 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; 504 kmp_uint32 
branch_factor = 1 << branch_bits; 505 kmp_uint32 offset; 506 kmp_uint32 level; 507 508 KA_TRACE( 509 20, 510 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", 511 gtid, team->t.t_id, tid, bt)); 512 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 513 514 #if USE_ITT_BUILD && USE_ITT_NOTIFY 515 // Barrier imbalance - save arrive time to the thread 516 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 517 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = 518 __itt_get_timestamp(); 519 } 520 #endif 521 /* Perform a hypercube-embedded tree gather to wait until all of the threads 522 have arrived, and reduce any required data as we go. */ 523 kmp_flag_64<> p_flag(&thr_bar->b_arrived); 524 for (level = 0, offset = 1; offset < num_threads; 525 level += branch_bits, offset <<= branch_bits) { 526 kmp_uint32 child; 527 kmp_uint32 child_tid; 528 529 if (((tid >> level) & (branch_factor - 1)) != 0) { 530 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); 531 532 KMP_MB(); // Synchronize parent and child threads. 533 KA_TRACE(20, 534 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 535 "arrived(%p): %llu => %llu\n", 536 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), 537 team->t.t_id, parent_tid, &thr_bar->b_arrived, 538 thr_bar->b_arrived, 539 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 540 // Mark arrival to parent thread 541 /* After performing this write (in the last iteration of the enclosing for 542 loop), a worker thread may not assume that the team is valid any more 543 - it could be deallocated by the primary thread at any time. */ 544 p_flag.set_waiter(other_threads[parent_tid]); 545 p_flag.release(); 546 break; 547 } 548 549 // Parent threads wait for children to arrive 550 if (new_state == KMP_BARRIER_UNUSED_STATE) 551 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 552 for (child = 1, child_tid = tid + (1 << level); 553 child < branch_factor && child_tid < num_threads; 554 child++, child_tid += (1 << level)) { 555 kmp_info_t *child_thr = other_threads[child_tid]; 556 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 557 #if KMP_CACHE_MANAGE 558 kmp_uint32 next_child_tid = child_tid + (1 << level); 559 // Prefetch next thread's arrived count 560 if (child + 1 < branch_factor && next_child_tid < num_threads) 561 KMP_CACHE_PREFETCH( 562 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); 563 #endif /* KMP_CACHE_MANAGE */ 564 KA_TRACE(20, 565 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 566 "arrived(%p) == %llu\n", 567 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 568 team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); 569 // Wait for child to arrive 570 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state); 571 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 572 KMP_MB(); // Synchronize parent and child threads. 573 #if USE_ITT_BUILD && USE_ITT_NOTIFY 574 // Barrier imbalance - write min of the thread time and a child time to 575 // the thread. 
576 if (__kmp_forkjoin_frames_mode == 2) { 577 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, 578 child_thr->th.th_bar_min_time); 579 } 580 #endif 581 if (reduce) { 582 KA_TRACE(100, 583 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", 584 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 585 team->t.t_id, child_tid)); 586 OMPT_REDUCTION_DECL(this_thr, gtid); 587 OMPT_REDUCTION_BEGIN; 588 (*reduce)(this_thr->th.th_local.reduce_data, 589 child_thr->th.th_local.reduce_data); 590 OMPT_REDUCTION_END; 591 } 592 } 593 } 594 595 if (KMP_MASTER_TID(tid)) { 596 // Need to update the team arrived pointer if we are the primary thread 597 if (new_state == KMP_BARRIER_UNUSED_STATE) 598 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; 599 else 600 team->t.t_bar[bt].b_arrived = new_state; 601 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " 602 "arrived(%p) = %llu\n", 603 gtid, team->t.t_id, tid, team->t.t_id, 604 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 605 } 606 KA_TRACE( 607 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 608 gtid, team->t.t_id, tid, bt)); 609 } 610 611 // The reverse versions seem to beat the forward versions overall 612 #define KMP_REVERSE_HYPER_BAR 613 static void __kmp_hyper_barrier_release( 614 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 615 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 616 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); 617 kmp_team_t *team; 618 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 619 kmp_info_t **other_threads; 620 kmp_uint32 num_threads; 621 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; 622 kmp_uint32 branch_factor = 1 << branch_bits; 623 kmp_uint32 child; 624 kmp_uint32 child_tid; 625 kmp_uint32 offset; 626 kmp_uint32 level; 627 628 /* Perform a hypercube-embedded tree release for all of the threads that have 629 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads 630 are released in the reverse order of the corresponding gather, otherwise 631 threads are released in the same order. */ 632 if (KMP_MASTER_TID(tid)) { // primary thread 633 team = __kmp_threads[gtid]->th.th_team; 634 KMP_DEBUG_ASSERT(team != NULL); 635 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for " 636 "barrier type %d\n", 637 gtid, team->t.t_id, tid, bt)); 638 #if KMP_BARRIER_ICV_PUSH 639 if (propagate_icvs) { // primary already has ICVs in final destination; copy 640 copy_icvs(&thr_bar->th_fixed_icvs, 641 &team->t.t_implicit_task_taskdata[tid].td_icvs); 642 } 643 #endif 644 } else { // Handle fork barrier workers who aren't part of a team yet 645 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, 646 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 647 // Wait for parent thread to release us 648 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 649 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 650 #if USE_ITT_BUILD && USE_ITT_NOTIFY 651 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 652 // In fork barrier where we could not get the object reliably 653 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 654 // Cancel wait on previous parallel region... 
655 __kmp_itt_task_starting(itt_sync_obj); 656 657 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 658 return; 659 660 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 661 if (itt_sync_obj != NULL) 662 // Call prepare as early as possible for "new" barrier 663 __kmp_itt_task_finished(itt_sync_obj); 664 } else 665 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 666 // Early exit for reaping threads releasing forkjoin barrier 667 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 668 return; 669 670 // The worker thread may now assume that the team is valid. 671 team = __kmp_threads[gtid]->th.th_team; 672 KMP_DEBUG_ASSERT(team != NULL); 673 tid = __kmp_tid_from_gtid(gtid); 674 675 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 676 KA_TRACE(20, 677 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 678 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 679 KMP_MB(); // Flush all pending memory write invalidates. 680 } 681 num_threads = this_thr->th.th_team_nproc; 682 other_threads = team->t.t_threads; 683 684 #ifdef KMP_REVERSE_HYPER_BAR 685 // Count up to correct level for parent 686 for (level = 0, offset = 1; 687 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0); 688 level += branch_bits, offset <<= branch_bits) 689 ; 690 691 // Now go down from there 692 for (level -= branch_bits, offset >>= branch_bits; offset != 0; 693 level -= branch_bits, offset >>= branch_bits) 694 #else 695 // Go down the tree, level by level 696 for (level = 0, offset = 1; offset < num_threads; 697 level += branch_bits, offset <<= branch_bits) 698 #endif // KMP_REVERSE_HYPER_BAR 699 { 700 #ifdef KMP_REVERSE_HYPER_BAR 701 /* Now go in reverse order through the children, highest to lowest. 702 Initial setting of child is conservative here. */ 703 child = num_threads >> ((level == 0) ? level : level - 1); 704 for (child = (child < branch_factor - 1) ? 
child : branch_factor - 1, 705 child_tid = tid + (child << level); 706 child >= 1; child--, child_tid -= (1 << level)) 707 #else 708 if (((tid >> level) & (branch_factor - 1)) != 0) 709 // No need to go lower than this, since this is the level parent would be 710 // notified 711 break; 712 // Iterate through children on this level of the tree 713 for (child = 1, child_tid = tid + (1 << level); 714 child < branch_factor && child_tid < num_threads; 715 child++, child_tid += (1 << level)) 716 #endif // KMP_REVERSE_HYPER_BAR 717 { 718 if (child_tid >= num_threads) 719 continue; // Child doesn't exist so keep going 720 else { 721 kmp_info_t *child_thr = other_threads[child_tid]; 722 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 723 #if KMP_CACHE_MANAGE 724 kmp_uint32 next_child_tid = child_tid - (1 << level); 725 // Prefetch next thread's go count 726 #ifdef KMP_REVERSE_HYPER_BAR 727 if (child - 1 >= 1 && next_child_tid < num_threads) 728 #else 729 if (child + 1 < branch_factor && next_child_tid < num_threads) 730 #endif // KMP_REVERSE_HYPER_BAR 731 KMP_CACHE_PREFETCH( 732 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); 733 #endif /* KMP_CACHE_MANAGE */ 734 735 #if KMP_BARRIER_ICV_PUSH 736 if (propagate_icvs) // push my fixed ICVs to my child 737 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); 738 #endif // KMP_BARRIER_ICV_PUSH 739 740 KA_TRACE( 741 20, 742 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" 743 "go(%p): %u => %u\n", 744 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 745 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 746 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 747 // Release child from barrier 748 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 749 flag.release(); 750 } 751 } 752 } 753 #if KMP_BARRIER_ICV_PUSH 754 if (propagate_icvs && 755 !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest 756 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, 757 FALSE); 758 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 759 &thr_bar->th_fixed_icvs); 760 } 761 #endif 762 KA_TRACE( 763 20, 764 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", 765 gtid, team->t.t_id, tid, bt)); 766 } 767 768 // Hierarchical Barrier 769 770 // Initialize thread barrier data 771 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. 772 Performs the minimum amount of initialization required based on how the team 773 has changed. Returns true if leaf children will require both on-core and 774 traditional wake-up mechanisms. For example, if the team size increases, 775 threads already in the team will respond to on-core wakeup on their parent 776 thread, but threads newly added to the team will only be listening on the 777 their local b_go. 
*/ 778 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, 779 kmp_bstate_t *thr_bar, 780 kmp_uint32 nproc, int gtid, 781 int tid, kmp_team_t *team) { 782 // Checks to determine if (re-)initialization is needed 783 bool uninitialized = thr_bar->team == NULL; 784 bool team_changed = team != thr_bar->team; 785 bool team_sz_changed = nproc != thr_bar->nproc; 786 bool tid_changed = tid != thr_bar->old_tid; 787 bool retval = false; 788 789 if (uninitialized || team_sz_changed) { 790 __kmp_get_hierarchy(nproc, thr_bar); 791 } 792 793 if (uninitialized || team_sz_changed || tid_changed) { 794 thr_bar->my_level = thr_bar->depth - 1; // default for primary thread 795 thr_bar->parent_tid = -1; // default for primary thread 796 if (!KMP_MASTER_TID(tid)) { 797 // if not primary thread, find parent thread in hierarchy 798 kmp_uint32 d = 0; 799 while (d < thr_bar->depth) { // find parent based on level of thread in 800 // hierarchy, and note level 801 kmp_uint32 rem; 802 if (d == thr_bar->depth - 2) { // reached level right below the primary 803 thr_bar->parent_tid = 0; 804 thr_bar->my_level = d; 805 break; 806 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) { 807 // TODO: can we make the above op faster? 808 // thread is not a subtree root at next level, so this is max 809 thr_bar->parent_tid = tid - rem; 810 thr_bar->my_level = d; 811 break; 812 } 813 ++d; 814 } 815 } 816 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) / 817 (thr_bar->skip_per_level[thr_bar->my_level])), 818 &(thr_bar->offset)); 819 thr_bar->old_tid = tid; 820 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 821 thr_bar->team = team; 822 thr_bar->parent_bar = 823 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 824 } 825 if (uninitialized || team_changed || tid_changed) { 826 thr_bar->team = team; 827 thr_bar->parent_bar = 828 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 829 retval = true; 830 } 831 if (uninitialized || team_sz_changed || tid_changed) { 832 thr_bar->nproc = nproc; 833 thr_bar->leaf_kids = thr_bar->base_leaf_kids; 834 if (thr_bar->my_level == 0) 835 thr_bar->leaf_kids = 0; 836 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) 837 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids)); 838 thr_bar->leaf_state = 0; 839 for (int i = 0; i < thr_bar->leaf_kids; ++i) 840 ((char *)&(thr_bar->leaf_state))[7 - i] = 1; 841 } 842 return retval; 843 } 844 845 static void __kmp_hierarchical_barrier_gather( 846 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 847 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 848 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); 849 kmp_team_t *team = this_thr->th.th_team; 850 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 851 kmp_uint32 nproc = this_thr->th.th_team_nproc; 852 kmp_info_t **other_threads = team->t.t_threads; 853 kmp_uint64 new_state = 0; 854 855 int level = team->t.t_level; 856 if (other_threads[0] 857 ->th.th_teams_microtask) // are we inside the teams construct? 
858 if (this_thr->th.th_teams_size.nteams > 1) 859 ++level; // level was not increased in teams construct for team_of_masters 860 if (level == 1) 861 thr_bar->use_oncore_barrier = 1; 862 else 863 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 864 865 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for " 866 "barrier type %d\n", 867 gtid, team->t.t_id, tid, bt)); 868 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 869 870 #if USE_ITT_BUILD && USE_ITT_NOTIFY 871 // Barrier imbalance - save arrive time to the thread 872 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 873 this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); 874 } 875 #endif 876 877 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, 878 team); 879 880 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) 881 kmp_int32 child_tid; 882 new_state = 883 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 884 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 885 thr_bar->use_oncore_barrier) { 886 if (thr_bar->leaf_kids) { 887 // First, wait for leaf children to check-in on my b_arrived flag 888 kmp_uint64 leaf_state = 889 KMP_MASTER_TID(tid) 890 ? thr_bar->b_arrived | thr_bar->leaf_state 891 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; 892 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " 893 "for leaf kids\n", 894 gtid, team->t.t_id, tid)); 895 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state); 896 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 897 if (reduce) { 898 OMPT_REDUCTION_DECL(this_thr, gtid); 899 OMPT_REDUCTION_BEGIN; 900 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; 901 ++child_tid) { 902 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 903 "T#%d(%d:%d)\n", 904 gtid, team->t.t_id, tid, 905 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 906 child_tid)); 907 (*reduce)(this_thr->th.th_local.reduce_data, 908 other_threads[child_tid]->th.th_local.reduce_data); 909 } 910 OMPT_REDUCTION_END; 911 } 912 // clear leaf_state bits 913 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state)); 914 } 915 // Next, wait for higher level children on each child's b_arrived flag 916 for (kmp_uint32 d = 1; d < thr_bar->my_level; 917 ++d) { // gather lowest level threads first, but skip 0 918 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], 919 skip = thr_bar->skip_per_level[d]; 920 if (last > nproc) 921 last = nproc; 922 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 923 kmp_info_t *child_thr = other_threads[child_tid]; 924 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 925 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 926 "T#%d(%d:%d) " 927 "arrived(%p) == %llu\n", 928 gtid, team->t.t_id, tid, 929 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 930 child_tid, &child_bar->b_arrived, new_state)); 931 kmp_flag_64<> flag(&child_bar->b_arrived, new_state); 932 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 933 if (reduce) { 934 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 935 "T#%d(%d:%d)\n", 936 gtid, team->t.t_id, tid, 937 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 938 child_tid)); 939 (*reduce)(this_thr->th.th_local.reduce_data, 940 child_thr->th.th_local.reduce_data); 941 } 942 } 943 } 944 } else { // Blocktime is not infinite 945 for (kmp_uint32 d = 0; d < thr_bar->my_level; 946 ++d) 
{ // Gather lowest level threads first 947 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], 948 skip = thr_bar->skip_per_level[d]; 949 if (last > nproc) 950 last = nproc; 951 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 952 kmp_info_t *child_thr = other_threads[child_tid]; 953 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 954 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 955 "T#%d(%d:%d) " 956 "arrived(%p) == %llu\n", 957 gtid, team->t.t_id, tid, 958 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 959 child_tid, &child_bar->b_arrived, new_state)); 960 kmp_flag_64<> flag(&child_bar->b_arrived, new_state); 961 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 962 if (reduce) { 963 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 964 "T#%d(%d:%d)\n", 965 gtid, team->t.t_id, tid, 966 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 967 child_tid)); 968 (*reduce)(this_thr->th.th_local.reduce_data, 969 child_thr->th.th_local.reduce_data); 970 } 971 } 972 } 973 } 974 } 975 // All subordinates are gathered; now release parent if not primary thread 976 977 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy 978 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing" 979 " T#%d(%d:%d) arrived(%p): %llu => %llu\n", 980 gtid, team->t.t_id, tid, 981 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, 982 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, 983 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 984 /* Mark arrival to parent: After performing this write, a worker thread may 985 not assume that the team is valid any more - it could be deallocated by 986 the primary thread at any time. */ 987 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || 988 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived 989 // flag; release it 990 kmp_flag_64<> flag(&thr_bar->b_arrived, 991 other_threads[thr_bar->parent_tid]); 992 flag.release(); 993 } else { 994 // Leaf does special release on "offset" bits of parent's b_arrived flag 995 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 996 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, 997 thr_bar->offset + 1); 998 flag.set_waiter(other_threads[thr_bar->parent_tid]); 999 flag.release(); 1000 } 1001 } else { // Primary thread needs to update the team's b_arrived value 1002 team->t.t_bar[bt].b_arrived = new_state; 1003 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " 1004 "arrived(%p) = %llu\n", 1005 gtid, team->t.t_id, tid, team->t.t_id, 1006 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 1007 } 1008 // Is the team access below unsafe or just technically invalid? 
1009 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " 1010 "barrier type %d\n", 1011 gtid, team->t.t_id, tid, bt)); 1012 } 1013 1014 static void __kmp_hierarchical_barrier_release( 1015 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 1016 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 1017 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); 1018 kmp_team_t *team; 1019 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 1020 kmp_uint32 nproc; 1021 bool team_change = false; // indicates on-core barrier shouldn't be used 1022 1023 if (KMP_MASTER_TID(tid)) { 1024 team = __kmp_threads[gtid]->th.th_team; 1025 KMP_DEBUG_ASSERT(team != NULL); 1026 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " 1027 "entered barrier type %d\n", 1028 gtid, team->t.t_id, tid, bt)); 1029 } else { // Worker threads 1030 // Wait for parent thread to release me 1031 if (!thr_bar->use_oncore_barrier || 1032 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || 1033 thr_bar->team == NULL) { 1034 // Use traditional method of waiting on my own b_go flag 1035 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; 1036 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 1037 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1038 TCW_8(thr_bar->b_go, 1039 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1040 } else { // Thread barrier data is initialized, this is a leaf, blocktime is 1041 // infinite, not nested 1042 // Wait on my "offset" bits on parent's b_go flag 1043 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; 1044 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, 1045 thr_bar->offset + 1, bt, 1046 this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); 1047 flag.wait(this_thr, TRUE); 1048 if (thr_bar->wait_flag == 1049 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go 1050 TCW_8(thr_bar->b_go, 1051 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1052 } else { // Reset my bits on parent's b_go flag 1053 (RCAST(volatile char *, 1054 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0; 1055 } 1056 } 1057 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 1058 // Early exit for reaping threads releasing forkjoin barrier 1059 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 1060 return; 1061 // The worker thread may now assume that the team is valid. 1062 team = __kmp_threads[gtid]->th.th_team; 1063 KMP_DEBUG_ASSERT(team != NULL); 1064 tid = __kmp_tid_from_gtid(gtid); 1065 1066 KA_TRACE( 1067 20, 1068 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 1069 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 1070 KMP_MB(); // Flush all pending memory write invalidates. 1071 } 1072 1073 nproc = this_thr->th.th_team_nproc; 1074 int level = team->t.t_level; 1075 if (team->t.t_threads[0] 1076 ->th.th_teams_microtask) { // are we inside the teams construct? 1077 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 1078 this_thr->th.th_teams_level == level) 1079 ++level; // level was not increased in teams construct for team_of_workers 1080 if (this_thr->th.th_teams_size.nteams > 1) 1081 ++level; // level was not increased in teams construct for team_of_masters 1082 } 1083 if (level == 1) 1084 thr_bar->use_oncore_barrier = 1; 1085 else 1086 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 1087 1088 // If the team size has increased, we still communicate with old leaves via 1089 // oncore barrier. 
1090 unsigned short int old_leaf_kids = thr_bar->leaf_kids; 1091 kmp_uint64 old_leaf_state = thr_bar->leaf_state; 1092 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, 1093 tid, team); 1094 // But if the entire team changes, we won't use oncore barrier at all 1095 if (team_change) 1096 old_leaf_kids = 0; 1097 1098 #if KMP_BARRIER_ICV_PUSH 1099 if (propagate_icvs) { 1100 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, 1101 FALSE); 1102 if (KMP_MASTER_TID( 1103 tid)) { // primary already has copy in final destination; copy 1104 copy_icvs(&thr_bar->th_fixed_icvs, 1105 &team->t.t_implicit_task_taskdata[tid].td_icvs); 1106 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1107 thr_bar->use_oncore_barrier) { // optimization for inf blocktime 1108 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) 1109 // leaves (on-core children) pull parent's fixed ICVs directly to local 1110 // ICV store 1111 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1112 &thr_bar->parent_bar->th_fixed_icvs); 1113 // non-leaves will get ICVs piggybacked with b_go via NGO store 1114 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs 1115 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can 1116 // access 1117 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); 1118 else // leaves copy parent's fixed ICVs directly to local ICV store 1119 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1120 &thr_bar->parent_bar->th_fixed_icvs); 1121 } 1122 } 1123 #endif // KMP_BARRIER_ICV_PUSH 1124 1125 // Now, release my children 1126 if (thr_bar->my_level) { // not a leaf 1127 kmp_int32 child_tid; 1128 kmp_uint32 last; 1129 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1130 thr_bar->use_oncore_barrier) { 1131 if (KMP_MASTER_TID(tid)) { // do a flat release 1132 // Set local b_go to bump children via NGO store of the cache line 1133 // containing IVCs and b_go. 1134 thr_bar->b_go = KMP_BARRIER_STATE_BUMP; 1135 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of 1136 // the cache line 1137 ngo_load(&thr_bar->th_fixed_icvs); 1138 // This loops over all the threads skipping only the leaf nodes in the 1139 // hierarchy 1140 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; 1141 child_tid += thr_bar->skip_per_level[1]) { 1142 kmp_bstate_t *child_bar = 1143 &team->t.t_threads[child_tid]->th.th_bar[bt].bb; 1144 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1145 "releasing T#%d(%d:%d)" 1146 " go(%p): %u => %u\n", 1147 gtid, team->t.t_id, tid, 1148 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1149 child_tid, &child_bar->b_go, child_bar->b_go, 1150 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1151 // Use ngo store (if available) to both store ICVs and release child 1152 // via child's b_go 1153 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); 1154 } 1155 ngo_sync(); 1156 } 1157 TCW_8(thr_bar->b_go, 1158 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1159 // Now, release leaf children 1160 if (thr_bar->leaf_kids) { // if there are any 1161 // We test team_change on the off-chance that the level 1 team changed. 
1162 if (team_change || 1163 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new 1164 if (old_leaf_kids) { // release old leaf kids 1165 thr_bar->b_go |= old_leaf_state; 1166 } 1167 // Release new leaf kids 1168 last = tid + thr_bar->skip_per_level[1]; 1169 if (last > nproc) 1170 last = nproc; 1171 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; 1172 ++child_tid) { // skip_per_level[0]=1 1173 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1174 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1175 KA_TRACE( 1176 20, 1177 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" 1178 " T#%d(%d:%d) go(%p): %u => %u\n", 1179 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 1180 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 1181 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1182 // Release child using child's b_go flag 1183 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1184 flag.release(); 1185 } 1186 } else { // Release all children at once with leaf_state bits on my own 1187 // b_go flag 1188 thr_bar->b_go |= thr_bar->leaf_state; 1189 } 1190 } 1191 } else { // Blocktime is not infinite; do a simple hierarchical release 1192 for (int d = thr_bar->my_level - 1; d >= 0; 1193 --d) { // Release highest level threads first 1194 last = tid + thr_bar->skip_per_level[d + 1]; 1195 kmp_uint32 skip = thr_bar->skip_per_level[d]; 1196 if (last > nproc) 1197 last = nproc; 1198 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 1199 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1200 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1201 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1202 "releasing T#%d(%d:%d) go(%p): %u => %u\n", 1203 gtid, team->t.t_id, tid, 1204 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1205 child_tid, &child_bar->b_go, child_bar->b_go, 1206 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1207 // Release child using child's b_go flag 1208 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1209 flag.release(); 1210 } 1211 } 1212 } 1213 #if KMP_BARRIER_ICV_PUSH 1214 if (propagate_icvs && !KMP_MASTER_TID(tid)) 1215 // non-leaves copy ICVs from fixed ICVs to local dest 1216 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1217 &thr_bar->th_fixed_icvs); 1218 #endif // KMP_BARRIER_ICV_PUSH 1219 } 1220 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for " 1221 "barrier type %d\n", 1222 gtid, team->t.t_id, tid, bt)); 1223 } 1224 1225 // End of Barrier Algorithms 1226 1227 // type traits for cancellable value 1228 // if cancellable is true, then is_cancellable is a normal boolean variable 1229 // if cancellable is false, then is_cancellable is a compile time constant 1230 template <bool cancellable> struct is_cancellable {}; 1231 template <> struct is_cancellable<true> { 1232 bool value; 1233 is_cancellable() : value(false) {} 1234 is_cancellable(bool b) : value(b) {} 1235 is_cancellable &operator=(bool b) { 1236 value = b; 1237 return *this; 1238 } 1239 operator bool() const { return value; } 1240 }; 1241 template <> struct is_cancellable<false> { 1242 is_cancellable &operator=(bool b) { return *this; } 1243 constexpr operator bool() const { return false; } 1244 }; 1245 1246 // Internal function to do a barrier. 
1247 /* If is_split is true, do a split barrier, otherwise, do a plain barrier 1248 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split 1249 barrier 1250 When cancellable = false, 1251 Returns 0 if primary thread, 1 if worker thread. 1252 When cancellable = true 1253 Returns 0 if not cancelled, 1 if cancelled. */ 1254 template <bool cancellable = false> 1255 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, 1256 size_t reduce_size, void *reduce_data, 1257 void (*reduce)(void *, void *)) { 1258 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); 1259 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 1260 int tid = __kmp_tid_from_gtid(gtid); 1261 kmp_info_t *this_thr = __kmp_threads[gtid]; 1262 kmp_team_t *team = this_thr->th.th_team; 1263 int status = 0; 1264 is_cancellable<cancellable> cancelled; 1265 #if OMPT_SUPPORT && OMPT_OPTIONAL 1266 ompt_data_t *my_task_data; 1267 ompt_data_t *my_parallel_data; 1268 void *return_address; 1269 ompt_sync_region_t barrier_kind; 1270 #endif 1271 1272 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid, 1273 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1274 1275 #if OMPT_SUPPORT 1276 if (ompt_enabled.enabled) { 1277 #if OMPT_OPTIONAL 1278 my_task_data = OMPT_CUR_TASK_DATA(this_thr); 1279 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 1280 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1281 barrier_kind = __ompt_get_barrier_kind(bt, this_thr); 1282 if (ompt_enabled.ompt_callback_sync_region) { 1283 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1284 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1285 return_address); 1286 } 1287 if (ompt_enabled.ompt_callback_sync_region_wait) { 1288 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1289 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1290 return_address); 1291 } 1292 #endif 1293 // It is OK to report the barrier state after the barrier begin callback. 1294 // According to the OMPT specification, a compliant implementation may 1295 // even delay reporting this state until the barrier begins to wait. 1296 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; 1297 } 1298 #endif 1299 1300 if (!team->t.t_serialized) { 1301 #if USE_ITT_BUILD 1302 // This value will be used in itt notify events below. 1303 void *itt_sync_obj = NULL; 1304 #if USE_ITT_NOTIFY 1305 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1306 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 1307 #endif 1308 #endif /* USE_ITT_BUILD */ 1309 if (__kmp_tasking_mode == tskm_extra_barrier) { 1310 __kmp_tasking_barrier(team, this_thr, gtid); 1311 KA_TRACE(15, 1312 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, 1313 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1314 } 1315 1316 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 1317 access it when the team struct is not guaranteed to exist. */ 1318 // See note about the corresponding code in __kmp_join_barrier() being 1319 // performance-critical. 
1320 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1321 #if KMP_USE_MONITOR 1322 this_thr->th.th_team_bt_intervals = 1323 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1324 this_thr->th.th_team_bt_set = 1325 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1326 #else 1327 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 1328 #endif 1329 } 1330 1331 #if USE_ITT_BUILD 1332 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1333 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 1334 #endif /* USE_ITT_BUILD */ 1335 #if USE_DEBUGGER 1336 // Let the debugger know: the thread arrived to the barrier and waiting. 1337 if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct 1338 team->t.t_bar[bt].b_master_arrived += 1; 1339 } else { 1340 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; 1341 } // if 1342 #endif /* USE_DEBUGGER */ 1343 if (reduce != NULL) { 1344 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 1345 this_thr->th.th_local.reduce_data = reduce_data; 1346 } 1347 1348 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) 1349 // use 0 to only setup the current team if nthreads > 1 1350 __kmp_task_team_setup(this_thr, team, 0); 1351 1352 if (cancellable) { 1353 cancelled = __kmp_linear_barrier_gather_cancellable( 1354 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1355 } else { 1356 switch (__kmp_barrier_gather_pattern[bt]) { 1357 case bp_hyper_bar: { 1358 // don't set branch bits to 0; use linear 1359 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1360 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, 1361 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1362 break; 1363 } 1364 case bp_hierarchical_bar: { 1365 __kmp_hierarchical_barrier_gather( 1366 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1367 break; 1368 } 1369 case bp_tree_bar: { 1370 // don't set branch bits to 0; use linear 1371 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1372 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, 1373 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1374 break; 1375 } 1376 default: { 1377 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, 1378 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1379 } 1380 } 1381 } 1382 1383 KMP_MB(); 1384 1385 if (KMP_MASTER_TID(tid)) { 1386 status = 0; 1387 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 1388 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 1389 } 1390 #if USE_DEBUGGER 1391 // Let the debugger know: All threads are arrived and starting leaving the 1392 // barrier. 1393 team->t.t_bar[bt].b_team_arrived += 1; 1394 #endif 1395 1396 if (__kmp_omp_cancellation) { 1397 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request); 1398 // Reset cancellation flag for worksharing constructs 1399 if (cancel_request == cancel_loop || 1400 cancel_request == cancel_sections) { 1401 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq); 1402 } 1403 } 1404 #if USE_ITT_BUILD 1405 /* TODO: In case of split reduction barrier, primary thread may send 1406 acquired event early, before the final summation into the shared 1407 variable is done (final summation can be a long operation for array 1408 reductions). 
*/ 1409 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1410 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1411 #endif /* USE_ITT_BUILD */ 1412 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1413 // Barrier - report frame end (only if active_level == 1) 1414 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 1415 __kmp_forkjoin_frames_mode && 1416 (this_thr->th.th_teams_microtask == NULL || // either not in teams 1417 this_thr->th.th_teams_size.nteams == 1) && // or inside single team 1418 team->t.t_active_level == 1) { 1419 ident_t *loc = __kmp_threads[gtid]->th.th_ident; 1420 kmp_uint64 cur_time = __itt_get_timestamp(); 1421 kmp_info_t **other_threads = team->t.t_threads; 1422 int nproc = this_thr->th.th_team_nproc; 1423 int i; 1424 switch (__kmp_forkjoin_frames_mode) { 1425 case 1: 1426 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1427 loc, nproc); 1428 this_thr->th.th_frame_time = cur_time; 1429 break; 1430 case 2: // AC 2015-01-19: currently does not work for hierarchical (to 1431 // be fixed) 1432 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1433 1, loc, nproc); 1434 break; 1435 case 3: 1436 if (__itt_metadata_add_ptr) { 1437 // Initialize with primary thread's wait time 1438 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; 1439 // Set arrive time to zero to be able to check it in 1440 // __kmp_invoke_task(); the same is done inside the loop below 1441 this_thr->th.th_bar_arrive_time = 0; 1442 for (i = 1; i < nproc; ++i) { 1443 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 1444 other_threads[i]->th.th_bar_arrive_time = 0; 1445 } 1446 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 1447 cur_time, delta, 1448 (kmp_uint64)(reduce != NULL)); 1449 } 1450 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1451 loc, nproc); 1452 this_thr->th.th_frame_time = cur_time; 1453 break; 1454 } 1455 } 1456 #endif /* USE_ITT_BUILD */ 1457 } else { 1458 status = 1; 1459 #if USE_ITT_BUILD 1460 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1461 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1462 #endif /* USE_ITT_BUILD */ 1463 } 1464 if ((status == 1 || !is_split) && !cancelled) { 1465 if (cancellable) { 1466 cancelled = __kmp_linear_barrier_release_cancellable( 1467 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1468 } else { 1469 switch (__kmp_barrier_release_pattern[bt]) { 1470 case bp_hyper_bar: { 1471 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1472 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 1473 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1474 break; 1475 } 1476 case bp_hierarchical_bar: { 1477 __kmp_hierarchical_barrier_release( 1478 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1479 break; 1480 } 1481 case bp_tree_bar: { 1482 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1483 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 1484 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1485 break; 1486 } 1487 default: { 1488 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 1489 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 1490 } 1491 } 1492 } 1493 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 1494 __kmp_task_team_sync(this_thr, team); 1495 } 1496 } 1497 1498 #if USE_ITT_BUILD 1499 /* GEH: TODO: Move this under if-condition above and also include in 1500 __kmp_end_split_barrier(). This will more accurately represent the actual 1501 release time of the threads for split barriers. 
*/ 1502 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1503 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 1504 #endif /* USE_ITT_BUILD */ 1505 } else { // Team is serialized. 1506 status = 0; 1507 if (__kmp_tasking_mode != tskm_immediate_exec) { 1508 if (this_thr->th.th_task_team != NULL) { 1509 #if USE_ITT_NOTIFY 1510 void *itt_sync_obj = NULL; 1511 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 1512 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 1513 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 1514 } 1515 #endif 1516 1517 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == 1518 TRUE); 1519 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 1520 __kmp_task_team_setup(this_thr, team, 0); 1521 1522 #if USE_ITT_BUILD 1523 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1524 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 1525 #endif /* USE_ITT_BUILD */ 1526 } 1527 } 1528 } 1529 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", 1530 gtid, __kmp_team_from_gtid(gtid)->t.t_id, 1531 __kmp_tid_from_gtid(gtid), status)); 1532 1533 #if OMPT_SUPPORT 1534 if (ompt_enabled.enabled) { 1535 #if OMPT_OPTIONAL 1536 if (ompt_enabled.ompt_callback_sync_region_wait) { 1537 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1538 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, 1539 return_address); 1540 } 1541 if (ompt_enabled.ompt_callback_sync_region) { 1542 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1543 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, 1544 return_address); 1545 } 1546 #endif 1547 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1548 } 1549 #endif 1550 1551 if (cancellable) 1552 return (int)cancelled; 1553 return status; 1554 } 1555 1556 // Returns 0 if primary thread, 1 if worker thread. 
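// __kmp_barrier() is the plain, non-cancellable entry point; it only
// instantiates __kmp_barrier_template<> with the default cancellable=false,
// so the cancellation paths above are never taken. Illustrative call only
// (not part of this file): a plain, non-split barrier with no reduction is
// requested as
//
//   __kmp_barrier(bs_plain_barrier, gtid, FALSE, /*reduce_size=*/0,
//                 /*reduce_data=*/NULL, /*reduce=*/NULL);
//
// which mirrors the fallback call made by __kmp_barrier_gomp_cancel() below
// when OpenMP cancellation is disabled.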
1557 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, 1558 size_t reduce_size, void *reduce_data, 1559 void (*reduce)(void *, void *)) { 1560 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data, 1561 reduce); 1562 } 1563 1564 #if defined(KMP_GOMP_COMPAT) 1565 // Returns 1 if cancelled, 0 otherwise 1566 int __kmp_barrier_gomp_cancel(int gtid) { 1567 if (__kmp_omp_cancellation) { 1568 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 1569 0, NULL, NULL); 1570 if (cancelled) { 1571 int tid = __kmp_tid_from_gtid(gtid); 1572 kmp_info_t *this_thr = __kmp_threads[gtid]; 1573 if (KMP_MASTER_TID(tid)) { 1574 // Primary thread does not need to revert anything 1575 } else { 1576 // Workers need to revert their private b_arrived flag 1577 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= 1578 KMP_BARRIER_STATE_BUMP; 1579 } 1580 } 1581 return cancelled; 1582 } 1583 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 1584 return FALSE; 1585 } 1586 #endif 1587 1588 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { 1589 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); 1590 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 1591 KMP_DEBUG_ASSERT(bt < bs_last_barrier); 1592 int tid = __kmp_tid_from_gtid(gtid); 1593 kmp_info_t *this_thr = __kmp_threads[gtid]; 1594 kmp_team_t *team = this_thr->th.th_team; 1595 1596 if (!team->t.t_serialized) { 1597 if (KMP_MASTER_GTID(gtid)) { 1598 switch (__kmp_barrier_release_pattern[bt]) { 1599 case bp_hyper_bar: { 1600 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1601 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 1602 FALSE USE_ITT_BUILD_ARG(NULL)); 1603 break; 1604 } 1605 case bp_hierarchical_bar: { 1606 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, 1607 FALSE USE_ITT_BUILD_ARG(NULL)); 1608 break; 1609 } 1610 case bp_tree_bar: { 1611 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 1612 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 1613 FALSE USE_ITT_BUILD_ARG(NULL)); 1614 break; 1615 } 1616 default: { 1617 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 1618 FALSE USE_ITT_BUILD_ARG(NULL)); 1619 } 1620 } 1621 if (__kmp_tasking_mode != tskm_immediate_exec) { 1622 __kmp_task_team_sync(this_thr, team); 1623 } // if 1624 } 1625 } 1626 } 1627 1628 void __kmp_join_barrier(int gtid) { 1629 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); 1630 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 1631 1632 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 1633 1634 kmp_info_t *this_thr = __kmp_threads[gtid]; 1635 kmp_team_t *team; 1636 kmp_uint nproc; 1637 kmp_info_t *master_thread; 1638 int tid; 1639 #ifdef KMP_DEBUG 1640 int team_id; 1641 #endif /* KMP_DEBUG */ 1642 #if USE_ITT_BUILD 1643 void *itt_sync_obj = NULL; 1644 #if USE_ITT_NOTIFY 1645 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need 1646 // Get object created at fork_barrier 1647 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 1648 #endif 1649 #endif /* USE_ITT_BUILD */ 1650 KMP_MB(); 1651 1652 // Get current info 1653 team = this_thr->th.th_team; 1654 nproc = this_thr->th.th_team_nproc; 1655 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); 1656 tid = __kmp_tid_from_gtid(gtid); 1657 #ifdef KMP_DEBUG 1658 team_id = team->t.t_id; 1659 #endif /* KMP_DEBUG */ 1660 master_thread = this_thr->th.th_team_master; 1661 #ifdef KMP_DEBUG 1662 if (master_thread != team->t.t_threads[0]) { 1663 __kmp_print_structure(); 1664 } 1665 #endif /* KMP_DEBUG */ 
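// Slot 0 of the team's thread array must hold the cached primary thread
// (th_team_master); the debug-only dump above prints the runtime structures
// when that invariant is about to fail, before the assertion below fires.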
1666 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); 1667 KMP_MB(); 1668 1669 // Verify state 1670 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); 1671 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); 1672 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); 1673 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", 1674 gtid, team_id, tid)); 1675 1676 #if OMPT_SUPPORT 1677 if (ompt_enabled.enabled) { 1678 #if OMPT_OPTIONAL 1679 ompt_data_t *my_task_data; 1680 ompt_data_t *my_parallel_data; 1681 void *codeptr = NULL; 1682 int ds_tid = this_thr->th.th_info.ds.ds_tid; 1683 if (KMP_MASTER_TID(ds_tid) && 1684 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 1685 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 1686 codeptr = team->t.ompt_team_info.master_return_address; 1687 my_task_data = OMPT_CUR_TASK_DATA(this_thr); 1688 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 1689 if (ompt_enabled.ompt_callback_sync_region) { 1690 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1691 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, 1692 my_task_data, codeptr); 1693 } 1694 if (ompt_enabled.ompt_callback_sync_region_wait) { 1695 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1696 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, 1697 my_task_data, codeptr); 1698 } 1699 if (!KMP_MASTER_TID(ds_tid)) 1700 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr); 1701 #endif 1702 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit; 1703 } 1704 #endif 1705 1706 if (__kmp_tasking_mode == tskm_extra_barrier) { 1707 __kmp_tasking_barrier(team, this_thr, gtid); 1708 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, 1709 team_id, tid)); 1710 } 1711 #ifdef KMP_DEBUG 1712 if (__kmp_tasking_mode != tskm_immediate_exec) { 1713 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = " 1714 "%p, th_task_team = %p\n", 1715 __kmp_gtid_from_thread(this_thr), team_id, 1716 team->t.t_task_team[this_thr->th.th_task_state], 1717 this_thr->th.th_task_team)); 1718 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == 1719 team->t.t_task_team[this_thr->th.th_task_state]); 1720 } 1721 #endif /* KMP_DEBUG */ 1722 1723 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 1724 access it when the team struct is not guaranteed to exist. Doing these 1725 loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround, 1726 we do not perform the copy if blocktime=infinite, since the values are not 1727 used by __kmp_wait_template() in that case. 
*/ 1728 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1729 #if KMP_USE_MONITOR 1730 this_thr->th.th_team_bt_intervals = 1731 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1732 this_thr->th.th_team_bt_set = 1733 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1734 #else 1735 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 1736 #endif 1737 } 1738 1739 #if USE_ITT_BUILD 1740 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1741 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 1742 #endif /* USE_ITT_BUILD */ 1743 1744 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { 1745 case bp_hyper_bar: { 1746 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 1747 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 1748 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 1749 break; 1750 } 1751 case bp_hierarchical_bar: { 1752 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 1753 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 1754 break; 1755 } 1756 case bp_tree_bar: { 1757 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 1758 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 1759 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 1760 break; 1761 } 1762 default: { 1763 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 1764 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 1765 } 1766 } 1767 1768 /* From this point on, the team data structure may be deallocated at any time 1769 by the primary thread - it is unsafe to reference it in any of the worker 1770 threads. Any per-team data items that need to be referenced before the 1771 end of the barrier should be moved to the kmp_task_team_t structs. */ 1772 if (KMP_MASTER_TID(tid)) { 1773 if (__kmp_tasking_mode != tskm_immediate_exec) { 1774 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 1775 } 1776 if (__kmp_display_affinity) { 1777 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0); 1778 } 1779 #if KMP_STATS_ENABLED 1780 // Have primary thread flag the workers to indicate they are now waiting for 1781 // the next parallel region. Also wake them up so they switch their timers to 1782 // idle. 
1783 for (int i = 0; i < team->t.t_nproc; ++i) { 1784 kmp_info_t *team_thread = team->t.t_threads[i]; 1785 if (team_thread == this_thr) 1786 continue; 1787 team_thread->th.th_stats->setIdleFlag(); 1788 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && 1789 team_thread->th.th_sleep_loc != NULL) 1790 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), 1791 team_thread->th.th_sleep_loc); 1792 } 1793 #endif 1794 #if USE_ITT_BUILD 1795 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1796 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1797 #endif /* USE_ITT_BUILD */ 1798 1799 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1800 // Join barrier - report frame end 1801 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 1802 __kmp_forkjoin_frames_mode && 1803 (this_thr->th.th_teams_microtask == NULL || // either not in teams 1804 this_thr->th.th_teams_size.nteams == 1) && // or inside single team 1805 team->t.t_active_level == 1) { 1806 kmp_uint64 cur_time = __itt_get_timestamp(); 1807 ident_t *loc = team->t.t_ident; 1808 kmp_info_t **other_threads = team->t.t_threads; 1809 int nproc = this_thr->th.th_team_nproc; 1810 int i; 1811 switch (__kmp_forkjoin_frames_mode) { 1812 case 1: 1813 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1814 loc, nproc); 1815 break; 1816 case 2: 1817 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, 1818 loc, nproc); 1819 break; 1820 case 3: 1821 if (__itt_metadata_add_ptr) { 1822 // Initialize with primary thread's wait time 1823 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; 1824 // Set arrive time to zero to be able to check it in 1825 // __kmp_invoke_task(); the same is done inside the loop below 1826 this_thr->th.th_bar_arrive_time = 0; 1827 for (i = 1; i < nproc; ++i) { 1828 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 1829 other_threads[i]->th.th_bar_arrive_time = 0; 1830 } 1831 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 1832 cur_time, delta, 0); 1833 } 1834 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1835 loc, nproc); 1836 this_thr->th.th_frame_time = cur_time; 1837 break; 1838 } 1839 } 1840 #endif /* USE_ITT_BUILD */ 1841 } 1842 #if USE_ITT_BUILD 1843 else { 1844 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1845 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1846 } 1847 #endif /* USE_ITT_BUILD */ 1848 1849 #if KMP_DEBUG 1850 if (KMP_MASTER_TID(tid)) { 1851 KA_TRACE( 1852 15, 1853 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", 1854 gtid, team_id, tid, nproc)); 1855 } 1856 #endif /* KMP_DEBUG */ 1857 1858 // TODO now, mark worker threads as done so they may be disbanded 1859 KMP_MB(); // Flush all pending memory write invalidates. 1860 KA_TRACE(10, 1861 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); 1862 1863 } 1864 1865 // TODO release worker threads' fork barriers as we are ready instead of all at 1866 // once 1867 void __kmp_fork_barrier(int gtid, int tid) { 1868 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); 1869 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 1870 kmp_info_t *this_thr = __kmp_threads[gtid]; 1871 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; 1872 #if USE_ITT_BUILD 1873 void *itt_sync_obj = NULL; 1874 #endif /* USE_ITT_BUILD */ 1875 if (team) 1876 1877 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, 1878 (team != NULL) ? 
team->t.t_id : -1, tid)); 1879 1880 // th_team pointer only valid for primary thread here 1881 if (KMP_MASTER_TID(tid)) { 1882 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1883 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 1884 // Create itt barrier object 1885 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); 1886 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing 1887 } 1888 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1889 1890 #ifdef KMP_DEBUG 1891 KMP_DEBUG_ASSERT(team); 1892 kmp_info_t **other_threads = team->t.t_threads; 1893 int i; 1894 1895 // Verify state 1896 KMP_MB(); 1897 1898 for (i = 1; i < team->t.t_nproc; ++i) { 1899 KA_TRACE(500, 1900 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " 1901 "== %u.\n", 1902 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, 1903 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, 1904 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); 1905 KMP_DEBUG_ASSERT( 1906 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) & 1907 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE); 1908 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); 1909 } 1910 #endif 1911 1912 if (__kmp_tasking_mode != tskm_immediate_exec) { 1913 // 0 indicates setup current task team if nthreads > 1 1914 __kmp_task_team_setup(this_thr, team, 0); 1915 } 1916 1917 /* The primary thread may have changed its blocktime between join barrier 1918 and fork barrier. Copy the blocktime info to the thread, where 1919 __kmp_wait_template() can access it when the team struct is not 1920 guaranteed to exist. */ 1921 // See note about the corresponding code in __kmp_join_barrier() being 1922 // performance-critical 1923 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1924 #if KMP_USE_MONITOR 1925 this_thr->th.th_team_bt_intervals = 1926 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1927 this_thr->th.th_team_bt_set = 1928 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1929 #else 1930 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 1931 #endif 1932 } 1933 } // primary thread 1934 1935 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { 1936 case bp_hyper_bar: { 1937 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 1938 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 1939 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1940 break; 1941 } 1942 case bp_hierarchical_bar: { 1943 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 1944 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1945 break; 1946 } 1947 case bp_tree_bar: { 1948 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 1949 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 1950 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1951 break; 1952 } 1953 default: { 1954 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 1955 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1956 } 1957 } 1958 1959 #if OMPT_SUPPORT 1960 if (ompt_enabled.enabled && 1961 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 1962 int ds_tid = this_thr->th.th_info.ds.ds_tid; 1963 ompt_data_t *task_data = (team) 1964 ? 
OMPT_CUR_TASK_DATA(this_thr) 1965 : &(this_thr->th.ompt_thread_info.task_data); 1966 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 1967 #if OMPT_OPTIONAL 1968 void *codeptr = NULL; 1969 if (KMP_MASTER_TID(ds_tid) && 1970 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 1971 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 1972 codeptr = team ? team->t.ompt_team_info.master_return_address : NULL; 1973 if (ompt_enabled.ompt_callback_sync_region_wait) { 1974 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1975 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 1976 codeptr); 1977 } 1978 if (ompt_enabled.ompt_callback_sync_region) { 1979 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1980 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 1981 codeptr); 1982 } 1983 #endif 1984 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 1985 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1986 ompt_scope_end, NULL, task_data, 0, ds_tid, 1987 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1988 } 1989 } 1990 #endif 1991 1992 // Early exit for reaping threads releasing forkjoin barrier 1993 if (TCR_4(__kmp_global.g.g_done)) { 1994 this_thr->th.th_task_team = NULL; 1995 1996 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1997 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 1998 if (!KMP_MASTER_TID(tid)) { 1999 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2000 if (itt_sync_obj) 2001 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2002 } 2003 } 2004 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2005 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); 2006 return; 2007 } 2008 2009 /* We can now assume that a valid team structure has been allocated by the 2010 primary thread and propagated to all worker threads. The current thread, 2011 however, may not be part of the team, so we can't blindly assume that the 2012 team pointer is non-null. */ 2013 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); 2014 KMP_DEBUG_ASSERT(team != NULL); 2015 tid = __kmp_tid_from_gtid(gtid); 2016 2017 #if KMP_BARRIER_ICV_PULL 2018 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in 2019 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's 2020 implicit task has this data before this function is called. We cannot 2021 modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's 2022 thread struct, because it is not always the case that the threads arrays 2023 have been allocated when __kmp_fork_call() is executed. */ 2024 { 2025 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 2026 if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs 2027 // Copy the initial ICVs from the primary thread's thread struct to the 2028 // implicit task for this tid. 
2029 KA_TRACE(10, 2030 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); 2031 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, 2032 tid, FALSE); 2033 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 2034 &team->t.t_threads[0] 2035 ->th.th_bar[bs_forkjoin_barrier] 2036 .bb.th_fixed_icvs); 2037 } 2038 } 2039 #endif // KMP_BARRIER_ICV_PULL 2040 2041 if (__kmp_tasking_mode != tskm_immediate_exec) { 2042 __kmp_task_team_sync(this_thr, team); 2043 } 2044 2045 #if KMP_AFFINITY_SUPPORTED 2046 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 2047 if (proc_bind == proc_bind_intel) { 2048 // Call dynamic affinity settings 2049 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) { 2050 __kmp_balanced_affinity(this_thr, team->t.t_nproc); 2051 } 2052 } else if (proc_bind != proc_bind_false) { 2053 if (this_thr->th.th_new_place == this_thr->th.th_current_place) { 2054 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", 2055 __kmp_gtid_from_thread(this_thr), 2056 this_thr->th.th_current_place)); 2057 } else { 2058 __kmp_affinity_set_place(gtid); 2059 } 2060 } 2061 #endif // KMP_AFFINITY_SUPPORTED 2062 // Perform the display affinity functionality 2063 if (__kmp_display_affinity) { 2064 if (team->t.t_display_affinity 2065 #if KMP_AFFINITY_SUPPORTED 2066 || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) 2067 #endif 2068 ) { 2069 // NULL means use the affinity-format-var ICV 2070 __kmp_aux_display_affinity(gtid, NULL); 2071 this_thr->th.th_prev_num_threads = team->t.t_nproc; 2072 this_thr->th.th_prev_level = team->t.t_level; 2073 } 2074 } 2075 if (!KMP_MASTER_TID(tid)) 2076 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator); 2077 2078 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2079 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2080 if (!KMP_MASTER_TID(tid)) { 2081 // Get correct barrier object 2082 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2083 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired 2084 } // (prepare called inside barrier_release) 2085 } 2086 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2087 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, 2088 team->t.t_id, tid)); 2089 } 2090 2091 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 2092 kmp_internal_control_t *new_icvs, ident_t *loc) { 2093 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); 2094 2095 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); 2096 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 2097 2098 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in 2099 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's 2100 implicit task has this data before this function is called. */ 2101 #if KMP_BARRIER_ICV_PULL 2102 /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which 2103 remains untouched), where all of the worker threads can access them and 2104 make their own copies after the barrier. */ 2105 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be 2106 // allocated at this point 2107 copy_icvs( 2108 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, 2109 new_icvs); 2110 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0, 2111 team->t.t_threads[0], team)); 2112 #elif KMP_BARRIER_ICV_PUSH 2113 // The ICVs will be propagated in the fork barrier, so nothing needs to be 2114 // done here. 
2115 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, 2116 team->t.t_threads[0], team)); 2117 #else 2118 // Copy the ICVs to each of the non-primary threads. This takes O(nthreads) 2119 // time. 2120 ngo_load(new_icvs); 2121 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be 2122 // allocated at this point 2123 for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread 2124 // TODO: GEH - pass in better source location info since usually NULL here 2125 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 2126 f, team->t.t_threads[f], team)); 2127 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); 2128 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); 2129 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 2130 f, team->t.t_threads[f], team)); 2131 } 2132 ngo_sync(); 2133 #endif // KMP_BARRIER_ICV_PULL 2134 } 2135
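// Illustrative sketch (comments only, not compiled): how the fork/join
// barriers above pair up around a parallel region from a worker thread's
// point of view. The surrounding loop and the microtask invocation live
// outside this file; the lowercase names below are simplified stand-ins,
// not real runtime entry points:
//
//   for (;;) {
//     __kmp_fork_barrier(gtid, tid);  // sleep until the primary thread
//                                     // releases the team (ICVs are picked
//                                     // up here under ICV_PUSH/ICV_PULL)
//     if (runtime_is_shutting_down)   // cf. the TCR_4(__kmp_global.g.g_done)
//       break;                        // early exit inside __kmp_fork_barrier
//     run_parallel_region_body();     // stand-in for the team's microtask
//     __kmp_join_barrier(gtid);       // gather; after this the primary
//                                     // thread may free or resize the team
//   }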