1 /* 2 * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef KMP_DISPATCH_HIER_H 14 #define KMP_DISPATCH_HIER_H 15 #include "kmp.h" 16 #include "kmp_dispatch.h" 17 18 // Layer type for scheduling hierarchy 19 enum kmp_hier_layer_e { 20 LAYER_THREAD = -1, 21 LAYER_L1, 22 LAYER_L2, 23 LAYER_L3, 24 LAYER_NUMA, 25 LAYER_LOOP, 26 LAYER_LAST 27 }; 28 29 // Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string 30 static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) { 31 switch (type) { 32 case kmp_hier_layer_e::LAYER_THREAD: 33 return "THREAD"; 34 case kmp_hier_layer_e::LAYER_L1: 35 return "L1"; 36 case kmp_hier_layer_e::LAYER_L2: 37 return "L2"; 38 case kmp_hier_layer_e::LAYER_L3: 39 return "L3"; 40 case kmp_hier_layer_e::LAYER_NUMA: 41 return "NUMA"; 42 case kmp_hier_layer_e::LAYER_LOOP: 43 return "WHOLE_LOOP"; 44 case kmp_hier_layer_e::LAYER_LAST: 45 return "LAST"; 46 } 47 KMP_ASSERT(0); 48 // Appease compilers, should never get here 49 return "ERROR"; 50 } 51 52 // Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy 53 typedef struct kmp_hier_sched_env_t { 54 int size; 55 int capacity; 56 enum sched_type *scheds; 57 kmp_int32 *small_chunks; 58 kmp_int64 *large_chunks; 59 kmp_hier_layer_e *layers; 60 // Append a level of the hierarchy 61 void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) { 62 if (capacity == 0) { 63 scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) * 64 kmp_hier_layer_e::LAYER_LAST); 65 small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) * 66 kmp_hier_layer_e::LAYER_LAST); 67 large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) * 68 kmp_hier_layer_e::LAYER_LAST); 69 layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) * 70 kmp_hier_layer_e::LAYER_LAST); 71 capacity = kmp_hier_layer_e::LAYER_LAST; 72 } 73 int current_size = size; 74 KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST); 75 scheds[current_size] = sched; 76 layers[current_size] = layer; 77 small_chunks[current_size] = chunk; 78 large_chunks[current_size] = (kmp_int64)chunk; 79 size++; 80 } 81 // Sort the hierarchy using selection sort, size will always be small 82 // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm 83 void sort() { 84 if (size <= 1) 85 return; 86 for (int i = 0; i < size; ++i) { 87 int switch_index = i; 88 for (int j = i + 1; j < size; ++j) { 89 if (layers[j] < layers[switch_index]) 90 switch_index = j; 91 } 92 if (switch_index != i) { 93 kmp_hier_layer_e temp1 = layers[i]; 94 enum sched_type temp2 = scheds[i]; 95 kmp_int32 temp3 = small_chunks[i]; 96 kmp_int64 temp4 = large_chunks[i]; 97 layers[i] = layers[switch_index]; 98 scheds[i] = scheds[switch_index]; 99 small_chunks[i] = small_chunks[switch_index]; 100 large_chunks[i] = large_chunks[switch_index]; 101 layers[switch_index] = temp1; 102 scheds[switch_index] = temp2; 103 small_chunks[switch_index] = temp3; 104 large_chunks[switch_index] = temp4; 105 } 106 } 107 } 108 // Free all memory 109 void deallocate() { 110 if (capacity > 0) { 111 __kmp_free(scheds); 112 __kmp_free(layers); 113 __kmp_free(small_chunks); 114 __kmp_free(large_chunks); 115 scheds = NULL; 116 layers = NULL; 117 small_chunks = NULL; 118 large_chunks = NULL; 119 } 120 size = 0; 121 capacity = 0; 122 } 123 } kmp_hier_sched_env_t; 124 125 extern int __kmp_dispatch_hand_threading; 126 extern kmp_hier_sched_env_t __kmp_hier_scheds; 127 128 // Sizes of layer arrays bounded by max number of detected L1s, L2s, etc. 129 extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1]; 130 extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1]; 131 132 extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type); 133 extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type); 134 extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, 135 kmp_hier_layer_e t2); 136 extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team); 137 138 template <typename T> struct kmp_hier_shared_bdata_t { 139 typedef typename traits_t<T>::signed_t ST; 140 volatile kmp_uint64 val[2]; 141 kmp_int32 status[2]; 142 T lb[2]; 143 T ub[2]; 144 ST st[2]; 145 dispatch_shared_info_template<T> sh[2]; 146 void zero() { 147 val[0] = val[1] = 0; 148 status[0] = status[1] = 0; 149 lb[0] = lb[1] = 0; 150 ub[0] = ub[1] = 0; 151 st[0] = st[1] = 0; 152 sh[0].u.s.iteration = sh[1].u.s.iteration = 0; 153 } 154 void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus, 155 kmp_uint64 index) { 156 lb[1 - index] = nlb; 157 ub[1 - index] = nub; 158 st[1 - index] = nst; 159 status[1 - index] = nstatus; 160 } 161 void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) { 162 lb[1 - index] = nlb; 163 ub[1 - index] = nub; 164 st[1 - index] = nst; 165 status[1 - index] = nstatus; 166 sh[1 - index].u.s.iteration = 0; 167 } 168 169 kmp_int32 get_next_status(kmp_uint64 index) const { 170 return status[1 - index]; 171 } 172 T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; } 173 T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; } 174 ST get_next_st(kmp_uint64 index) const { return st[1 - index]; } 175 dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) { 176 return &(sh[1 - index]); 177 } 178 179 kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; } 180 T get_curr_lb(kmp_uint64 index) const { return lb[index]; } 181 T get_curr_ub(kmp_uint64 index) const { return ub[index]; } 182 ST get_curr_st(kmp_uint64 index) const { return st[index]; } 183 dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) { 184 return &(sh[index]); 185 } 186 }; 187 188 /* 189 * In the barrier implementations, num_active is the number of threads that are 190 * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy. 191 * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t 192 * structure. tdata is the thread private data that resides on the thread 193 * data structure. 194 * 195 * The reset_shared() method is used to initialize the barrier data on the 196 * kmp_hier_top_unit_t hierarchy structure 197 * 198 * The reset_private() method is used to initialize the barrier data on the 199 * thread's private dispatch buffer structure 200 * 201 * The barrier() method takes an id, which is that thread's id for the 202 * kmp_hier_top_unit_t structure, and implements the barrier. All threads wait 203 * inside barrier() until all fellow threads who are attached to that 204 * kmp_hier_top_unit_t structure have arrived. 205 */ 206 207 // Core barrier implementation 208 // Can be used in a unit with between 2 to 8 threads 209 template <typename T> class core_barrier_impl { 210 static inline kmp_uint64 get_wait_val(int num_active) { 211 kmp_uint64 wait_val = 0LL; 212 switch (num_active) { 213 case 2: 214 wait_val = 0x0101LL; 215 break; 216 case 3: 217 wait_val = 0x010101LL; 218 break; 219 case 4: 220 wait_val = 0x01010101LL; 221 break; 222 case 5: 223 wait_val = 0x0101010101LL; 224 break; 225 case 6: 226 wait_val = 0x010101010101LL; 227 break; 228 case 7: 229 wait_val = 0x01010101010101LL; 230 break; 231 case 8: 232 wait_val = 0x0101010101010101LL; 233 break; 234 default: 235 // don't use the core_barrier_impl for more than 8 threads 236 KMP_ASSERT(0); 237 } 238 return wait_val; 239 } 240 241 public: 242 static void reset_private(kmp_int32 num_active, 243 kmp_hier_private_bdata_t *tdata); 244 static void reset_shared(kmp_int32 num_active, 245 kmp_hier_shared_bdata_t<T> *bdata); 246 static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata, 247 kmp_hier_private_bdata_t *tdata); 248 }; 249 250 template <typename T> 251 void core_barrier_impl<T>::reset_private(kmp_int32 num_active, 252 kmp_hier_private_bdata_t *tdata) { 253 tdata->num_active = num_active; 254 tdata->index = 0; 255 tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active); 256 } 257 template <typename T> 258 void core_barrier_impl<T>::reset_shared(kmp_int32 num_active, 259 kmp_hier_shared_bdata_t<T> *bdata) { 260 bdata->val[0] = bdata->val[1] = 0LL; 261 bdata->status[0] = bdata->status[1] = 0LL; 262 } 263 template <typename T> 264 void core_barrier_impl<T>::barrier(kmp_int32 id, 265 kmp_hier_shared_bdata_t<T> *bdata, 266 kmp_hier_private_bdata_t *tdata) { 267 kmp_uint64 current_index = tdata->index; 268 kmp_uint64 next_index = 1 - current_index; 269 kmp_uint64 current_wait_value = tdata->wait_val[current_index]; 270 kmp_uint64 next_wait_value = 271 (current_wait_value ? 0 : get_wait_val(tdata->num_active)); 272 KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu " 273 "next_index:%llu curr_wait:%llu next_wait:%llu\n", 274 __kmp_get_gtid(), current_index, next_index, current_wait_value, 275 next_wait_value)); 276 char v = (current_wait_value ? '\1' : '\0'); 277 (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v; 278 __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value, 279 __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL)); 280 tdata->wait_val[current_index] = next_wait_value; 281 tdata->index = next_index; 282 } 283 284 // Counter barrier implementation 285 // Can be used in a unit with arbitrary number of active threads 286 template <typename T> class counter_barrier_impl { 287 public: 288 static void reset_private(kmp_int32 num_active, 289 kmp_hier_private_bdata_t *tdata); 290 static void reset_shared(kmp_int32 num_active, 291 kmp_hier_shared_bdata_t<T> *bdata); 292 static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata, 293 kmp_hier_private_bdata_t *tdata); 294 }; 295 296 template <typename T> 297 void counter_barrier_impl<T>::reset_private(kmp_int32 num_active, 298 kmp_hier_private_bdata_t *tdata) { 299 tdata->num_active = num_active; 300 tdata->index = 0; 301 tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active; 302 } 303 template <typename T> 304 void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active, 305 kmp_hier_shared_bdata_t<T> *bdata) { 306 bdata->val[0] = bdata->val[1] = 0LL; 307 bdata->status[0] = bdata->status[1] = 0LL; 308 } 309 template <typename T> 310 void counter_barrier_impl<T>::barrier(kmp_int32 id, 311 kmp_hier_shared_bdata_t<T> *bdata, 312 kmp_hier_private_bdata_t *tdata) { 313 volatile kmp_int64 *val; 314 kmp_uint64 current_index = tdata->index; 315 kmp_uint64 next_index = 1 - current_index; 316 kmp_uint64 current_wait_value = tdata->wait_val[current_index]; 317 kmp_uint64 next_wait_value = current_wait_value + tdata->num_active; 318 319 KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu " 320 "next_index:%llu curr_wait:%llu next_wait:%llu\n", 321 __kmp_get_gtid(), current_index, next_index, current_wait_value, 322 next_wait_value)); 323 val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index])); 324 KMP_TEST_THEN_INC64(val); 325 __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value, 326 __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL)); 327 tdata->wait_val[current_index] = next_wait_value; 328 tdata->index = next_index; 329 } 330 331 // Data associated with topology unit within a layer 332 // For example, one kmp_hier_top_unit_t corresponds to one L1 cache 333 template <typename T> struct kmp_hier_top_unit_t { 334 typedef typename traits_t<T>::signed_t ST; 335 typedef typename traits_t<T>::unsigned_t UT; 336 kmp_int32 active; // number of topology units that communicate with this unit 337 // chunk information (lower/upper bound, stride, etc.) 338 dispatch_private_info_template<T> hier_pr; 339 kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit 340 kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit 341 342 kmp_int32 get_hier_id() const { return hier_pr.hier_id; } 343 void reset_shared_barrier() { 344 KMP_DEBUG_ASSERT(active > 0); 345 if (active == 1) 346 return; 347 hier_barrier.zero(); 348 if (active >= 2 && active <= 8) { 349 core_barrier_impl<T>::reset_shared(active, &hier_barrier); 350 } else { 351 counter_barrier_impl<T>::reset_shared(active, &hier_barrier); 352 } 353 } 354 void reset_private_barrier(kmp_hier_private_bdata_t *tdata) { 355 KMP_DEBUG_ASSERT(tdata); 356 KMP_DEBUG_ASSERT(active > 0); 357 if (active == 1) 358 return; 359 if (active >= 2 && active <= 8) { 360 core_barrier_impl<T>::reset_private(active, tdata); 361 } else { 362 counter_barrier_impl<T>::reset_private(active, tdata); 363 } 364 } 365 void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) { 366 KMP_DEBUG_ASSERT(tdata); 367 KMP_DEBUG_ASSERT(active > 0); 368 KMP_DEBUG_ASSERT(id >= 0 && id < active); 369 if (active == 1) { 370 tdata->index = 1 - tdata->index; 371 return; 372 } 373 if (active >= 2 && active <= 8) { 374 core_barrier_impl<T>::barrier(id, &hier_barrier, tdata); 375 } else { 376 counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata); 377 } 378 } 379 380 kmp_int32 get_next_status(kmp_uint64 index) const { 381 return hier_barrier.get_next_status(index); 382 } 383 T get_next_lb(kmp_uint64 index) const { 384 return hier_barrier.get_next_lb(index); 385 } 386 T get_next_ub(kmp_uint64 index) const { 387 return hier_barrier.get_next_ub(index); 388 } 389 ST get_next_st(kmp_uint64 index) const { 390 return hier_barrier.get_next_st(index); 391 } 392 dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) { 393 return hier_barrier.get_next_sh(index); 394 } 395 396 kmp_int32 get_curr_status(kmp_uint64 index) const { 397 return hier_barrier.get_curr_status(index); 398 } 399 T get_curr_lb(kmp_uint64 index) const { 400 return hier_barrier.get_curr_lb(index); 401 } 402 T get_curr_ub(kmp_uint64 index) const { 403 return hier_barrier.get_curr_ub(index); 404 } 405 ST get_curr_st(kmp_uint64 index) const { 406 return hier_barrier.get_curr_st(index); 407 } 408 dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) { 409 return hier_barrier.get_curr_sh(index); 410 } 411 412 void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status, 413 kmp_uint64 index) { 414 hier_barrier.set_next_hand_thread(lb, ub, st, status, index); 415 } 416 void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) { 417 hier_barrier.set_next(lb, ub, st, status, index); 418 } 419 dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; } 420 kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; } 421 dispatch_private_info_template<T> *get_parent_pr() { 422 return &(hier_parent->hier_pr); 423 } 424 425 kmp_int32 is_active() const { return active; } 426 kmp_int32 get_num_active() const { return active; } 427 #ifdef KMP_DEBUG 428 void print() { 429 KD_TRACE( 430 10, 431 (" kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n", 432 active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st, 433 hier_pr.u.p.tc)); 434 } 435 #endif 436 }; 437 438 // Information regarding a single layer within the scheduling hierarchy 439 template <typename T> struct kmp_hier_layer_info_t { 440 int num_active; // number of threads active in this level 441 kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc. 442 enum sched_type sched; // static, dynamic, guided, etc. 443 typename traits_t<T>::signed_t chunk; // chunk size associated with schedule 444 int length; // length of the kmp_hier_top_unit_t array 445 446 #ifdef KMP_DEBUG 447 // Print this layer's information 448 void print() { 449 const char *t = __kmp_get_hier_str(type); 450 KD_TRACE( 451 10, 452 (" kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d " 453 "length:%d\n", 454 num_active, t, sched, chunk, length)); 455 } 456 #endif 457 }; 458 459 /* 460 * Structure to implement entire hierarchy 461 * 462 * The hierarchy is kept as an array of arrays to represent the different 463 * layers. Layer 0 is the lowest layer to layer num_layers - 1 which is the 464 * highest layer. 465 * Example: 466 * [ 2 ] -> [ L3 | L3 ] 467 * [ 1 ] -> [ L2 | L2 | L2 | L2 ] 468 * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ] 469 * There is also an array of layer_info_t which has information regarding 470 * each layer 471 */ 472 template <typename T> struct kmp_hier_t { 473 public: 474 typedef typename traits_t<T>::unsigned_t UT; 475 typedef typename traits_t<T>::signed_t ST; 476 477 private: 478 int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current, 479 kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st, 480 kmp_int32 previous_id, int hier_level) { 481 int status; 482 kmp_info_t *th = __kmp_threads[gtid]; 483 auto parent = current->get_parent(); 484 bool last_layer = (hier_level == get_num_layers() - 1); 485 KMP_DEBUG_ASSERT(th); 486 kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]); 487 KMP_DEBUG_ASSERT(current); 488 KMP_DEBUG_ASSERT(hier_level >= 0); 489 KMP_DEBUG_ASSERT(hier_level < get_num_layers()); 490 KMP_DEBUG_ASSERT(tdata); 491 KMP_DEBUG_ASSERT(parent || last_layer); 492 493 KD_TRACE( 494 1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level)); 495 496 T hier_id = (T)current->get_hier_id(); 497 // Attempt to grab next iteration range for this level 498 if (previous_id == 0) { 499 KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n", 500 gtid, hier_level)); 501 kmp_int32 contains_last; 502 T my_lb, my_ub; 503 ST my_st; 504 T nproc; 505 dispatch_shared_info_template<T> volatile *my_sh; 506 dispatch_private_info_template<T> *my_pr; 507 if (last_layer) { 508 // last layer below the very top uses the single shared buffer 509 // from the team struct. 510 KD_TRACE(10, 511 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n", 512 gtid, hier_level)); 513 my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 514 th->th.th_dispatch->th_dispatch_sh_current); 515 nproc = (T)get_top_level_nproc(); 516 } else { 517 // middle layers use the shared buffer inside the kmp_hier_top_unit_t 518 // structure 519 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n", 520 gtid, hier_level)); 521 my_sh = 522 parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index); 523 nproc = (T)parent->get_num_active(); 524 } 525 my_pr = current->get_my_pr(); 526 KMP_DEBUG_ASSERT(my_sh); 527 KMP_DEBUG_ASSERT(my_pr); 528 enum sched_type schedule = get_sched(hier_level); 529 ST chunk = (ST)get_chunk(hier_level); 530 status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh, 531 &contains_last, &my_lb, &my_ub, 532 &my_st, nproc, hier_id); 533 KD_TRACE( 534 10, 535 ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n", 536 gtid, hier_level, status)); 537 // When no iterations are found (status == 0) and this is not the last 538 // layer, attempt to go up the hierarchy for more iterations 539 if (status == 0 && !last_layer) { 540 kmp_int32 hid; 541 __kmp_type_convert(hier_id, &hid); 542 status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub, 543 &my_st, hid, hier_level + 1); 544 KD_TRACE( 545 10, 546 ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n", 547 gtid, hier_level, status)); 548 if (status == 1) { 549 kmp_hier_private_bdata_t *upper_tdata = 550 &(th->th.th_hier_bar_data[hier_level + 1]); 551 my_sh = parent->get_curr_sh(upper_tdata->index); 552 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n", 553 gtid, hier_level)); 554 __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule, 555 parent->get_curr_lb(upper_tdata->index), 556 parent->get_curr_ub(upper_tdata->index), 557 parent->get_curr_st(upper_tdata->index), 558 #if USE_ITT_BUILD 559 NULL, 560 #endif 561 chunk, nproc, hier_id); 562 status = __kmp_dispatch_next_algorithm<T>( 563 gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc, 564 hier_id); 565 if (!status) { 566 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 " 567 "setting to 2!\n", 568 gtid, hier_level)); 569 status = 2; 570 } 571 } 572 } 573 current->set_next(my_lb, my_ub, my_st, status, tdata->index); 574 // Propagate whether a unit holds the actual global last iteration 575 // The contains_last attribute is sent downwards from the top to the 576 // bottom of the hierarchy via the contains_last flag inside the 577 // private dispatch buffers in the hierarchy's middle layers 578 if (contains_last) { 579 // If the next_algorithm() method returns 1 for p_last and it is the 580 // last layer or our parent contains the last serial chunk, then the 581 // chunk must contain the last serial iteration. 582 if (last_layer || parent->hier_pr.flags.contains_last) { 583 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr " 584 "to contain last.\n", 585 gtid, hier_level)); 586 current->hier_pr.flags.contains_last = contains_last; 587 } 588 if (!current->hier_pr.flags.contains_last) 589 contains_last = FALSE; 590 } 591 if (p_last) 592 *p_last = contains_last; 593 } // if master thread of this unit 594 if (hier_level > 0 || !__kmp_dispatch_hand_threading) { 595 KD_TRACE(10, 596 ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n", 597 gtid, hier_level)); 598 current->barrier(previous_id, tdata); 599 KD_TRACE(10, 600 ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n", 601 gtid, hier_level, current->get_curr_status(tdata->index))); 602 } else { 603 KMP_DEBUG_ASSERT(previous_id == 0); 604 return status; 605 } 606 return current->get_curr_status(tdata->index); 607 } 608 609 public: 610 int top_level_nproc; 611 int num_layers; 612 bool valid; 613 int type_size; 614 kmp_hier_layer_info_t<T> *info; 615 kmp_hier_top_unit_t<T> **layers; 616 // Deallocate all memory from this hierarchy 617 void deallocate() { 618 for (int i = 0; i < num_layers; ++i) 619 if (layers[i] != NULL) { 620 __kmp_free(layers[i]); 621 } 622 if (layers != NULL) { 623 __kmp_free(layers); 624 layers = NULL; 625 } 626 if (info != NULL) { 627 __kmp_free(info); 628 info = NULL; 629 } 630 num_layers = 0; 631 valid = false; 632 } 633 // Returns true if reallocation is needed else false 634 bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers, 635 const enum sched_type *new_scheds, 636 const ST *new_chunks) const { 637 if (!valid || layers == NULL || info == NULL || 638 traits_t<T>::type_size != type_size || n != num_layers) 639 return true; 640 for (int i = 0; i < n; ++i) { 641 if (info[i].type != new_layers[i]) 642 return true; 643 if (info[i].sched != new_scheds[i]) 644 return true; 645 if (info[i].chunk != new_chunks[i]) 646 return true; 647 } 648 return false; 649 } 650 // A single thread should call this function while the other threads wait 651 // create a new scheduling hierarchy consisting of new_layers, new_scheds 652 // and new_chunks. These should come pre-sorted according to 653 // kmp_hier_layer_e value. This function will try to avoid reallocation 654 // if it can 655 void allocate_hier(int n, const kmp_hier_layer_e *new_layers, 656 const enum sched_type *new_scheds, const ST *new_chunks) { 657 top_level_nproc = 0; 658 if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) { 659 KD_TRACE( 660 10, 661 ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n")); 662 for (int i = 0; i < n; ++i) { 663 info[i].num_active = 0; 664 for (int j = 0; j < get_length(i); ++j) 665 layers[i][j].active = 0; 666 } 667 return; 668 } 669 KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n")); 670 deallocate(); 671 type_size = traits_t<T>::type_size; 672 num_layers = n; 673 info = (kmp_hier_layer_info_t<T> *)__kmp_allocate( 674 sizeof(kmp_hier_layer_info_t<T>) * n); 675 layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate( 676 sizeof(kmp_hier_top_unit_t<T> *) * n); 677 for (int i = 0; i < n; ++i) { 678 int max = 0; 679 kmp_hier_layer_e layer = new_layers[i]; 680 info[i].num_active = 0; 681 info[i].type = layer; 682 info[i].sched = new_scheds[i]; 683 info[i].chunk = new_chunks[i]; 684 max = __kmp_hier_max_units[layer + 1]; 685 if (max == 0) { 686 valid = false; 687 KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer)); 688 deallocate(); 689 return; 690 } 691 info[i].length = max; 692 layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate( 693 sizeof(kmp_hier_top_unit_t<T>) * max); 694 for (int j = 0; j < max; ++j) { 695 layers[i][j].active = 0; 696 layers[i][j].hier_pr.flags.use_hier = TRUE; 697 } 698 } 699 valid = true; 700 } 701 // loc - source file location 702 // gtid - global thread identifier 703 // pr - this thread's private dispatch buffer (corresponding with gtid) 704 // p_last (return value) - pointer to flag indicating this set of iterations 705 // contains last 706 // iteration 707 // p_lb (return value) - lower bound for this chunk of iterations 708 // p_ub (return value) - upper bound for this chunk of iterations 709 // p_st (return value) - stride for this chunk of iterations 710 // 711 // Returns 1 if there are more iterations to perform, 0 otherwise 712 int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr, 713 kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) { 714 int status; 715 kmp_int32 contains_last = 0; 716 kmp_info_t *th = __kmp_threads[gtid]; 717 kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]); 718 auto parent = pr->get_parent(); 719 KMP_DEBUG_ASSERT(parent); 720 KMP_DEBUG_ASSERT(th); 721 KMP_DEBUG_ASSERT(tdata); 722 KMP_DEBUG_ASSERT(parent); 723 T nproc = (T)parent->get_num_active(); 724 T unit_id = (T)pr->get_hier_id(); 725 KD_TRACE( 726 10, 727 ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n", 728 gtid, nproc, unit_id)); 729 // Handthreading implementation 730 // Each iteration is performed by all threads on last unit (typically 731 // cores/tiles) 732 // e.g., threads 0,1,2,3 all execute iteration 0 733 // threads 0,1,2,3 all execute iteration 1 734 // threads 4,5,6,7 all execute iteration 2 735 // threads 4,5,6,7 all execute iteration 3 736 // ... etc. 737 if (__kmp_dispatch_hand_threading) { 738 KD_TRACE(10, 739 ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n", 740 gtid)); 741 if (unit_id == 0) { 742 // For hand threading, the sh buffer on the lowest level is only ever 743 // modified and read by the master thread on that level. Because of 744 // this, we can always use the first sh buffer. 745 auto sh = &(parent->hier_barrier.sh[0]); 746 KMP_DEBUG_ASSERT(sh); 747 status = __kmp_dispatch_next_algorithm<T>( 748 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id); 749 if (!status) { 750 bool done = false; 751 while (!done) { 752 done = true; 753 kmp_int32 uid; 754 __kmp_type_convert(unit_id, &uid); 755 status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub, 756 p_st, uid, 0); 757 if (status == 1) { 758 __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule, 759 parent->get_next_lb(tdata->index), 760 parent->get_next_ub(tdata->index), 761 parent->get_next_st(tdata->index), 762 #if USE_ITT_BUILD 763 NULL, 764 #endif 765 pr->u.p.parm1, nproc, unit_id); 766 sh->u.s.iteration = 0; 767 status = __kmp_dispatch_next_algorithm<T>( 768 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, 769 unit_id); 770 if (!status) { 771 KD_TRACE(10, 772 ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 " 773 "after next_pr_sh()" 774 "trying again.\n", 775 gtid)); 776 done = false; 777 } 778 } else if (status == 2) { 779 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 " 780 "trying again.\n", 781 gtid)); 782 done = false; 783 } 784 } 785 } 786 parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index); 787 } // if master thread of lowest unit level 788 parent->barrier(pr->get_hier_id(), tdata); 789 if (unit_id != 0) { 790 *p_lb = parent->get_curr_lb(tdata->index); 791 *p_ub = parent->get_curr_ub(tdata->index); 792 *p_st = parent->get_curr_st(tdata->index); 793 status = parent->get_curr_status(tdata->index); 794 } 795 } else { 796 // Normal implementation 797 // Each thread grabs an iteration chunk and executes it (no cooperation) 798 auto sh = parent->get_curr_sh(tdata->index); 799 KMP_DEBUG_ASSERT(sh); 800 status = __kmp_dispatch_next_algorithm<T>( 801 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id); 802 KD_TRACE(10, 803 ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d " 804 "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n", 805 gtid, status, contains_last, *p_lb, *p_ub, *p_st)); 806 if (!status) { 807 bool done = false; 808 while (!done) { 809 done = true; 810 kmp_int32 uid; 811 __kmp_type_convert(unit_id, &uid); 812 status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub, 813 p_st, uid, 0); 814 if (status == 1) { 815 sh = parent->get_curr_sh(tdata->index); 816 __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule, 817 parent->get_curr_lb(tdata->index), 818 parent->get_curr_ub(tdata->index), 819 parent->get_curr_st(tdata->index), 820 #if USE_ITT_BUILD 821 NULL, 822 #endif 823 pr->u.p.parm1, nproc, unit_id); 824 status = __kmp_dispatch_next_algorithm<T>( 825 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id); 826 if (!status) { 827 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 " 828 "after next_pr_sh()" 829 "trying again.\n", 830 gtid)); 831 done = false; 832 } 833 } else if (status == 2) { 834 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 " 835 "trying again.\n", 836 gtid)); 837 done = false; 838 } 839 } 840 } 841 } 842 if (contains_last && !parent->hier_pr.flags.contains_last) { 843 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting " 844 "contains_last to FALSE\n", 845 gtid)); 846 contains_last = FALSE; 847 } 848 if (p_last) 849 *p_last = contains_last; 850 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid, 851 status)); 852 return status; 853 } 854 // These functions probe the layer info structure 855 // Returns the type of topology unit given level 856 kmp_hier_layer_e get_type(int level) const { 857 KMP_DEBUG_ASSERT(level >= 0); 858 KMP_DEBUG_ASSERT(level < num_layers); 859 return info[level].type; 860 } 861 // Returns the schedule type at given level 862 enum sched_type get_sched(int level) const { 863 KMP_DEBUG_ASSERT(level >= 0); 864 KMP_DEBUG_ASSERT(level < num_layers); 865 return info[level].sched; 866 } 867 // Returns the chunk size at given level 868 ST get_chunk(int level) const { 869 KMP_DEBUG_ASSERT(level >= 0); 870 KMP_DEBUG_ASSERT(level < num_layers); 871 return info[level].chunk; 872 } 873 // Returns the number of active threads at given level 874 int get_num_active(int level) const { 875 KMP_DEBUG_ASSERT(level >= 0); 876 KMP_DEBUG_ASSERT(level < num_layers); 877 return info[level].num_active; 878 } 879 // Returns the length of topology unit array at given level 880 int get_length(int level) const { 881 KMP_DEBUG_ASSERT(level >= 0); 882 KMP_DEBUG_ASSERT(level < num_layers); 883 return info[level].length; 884 } 885 // Returns the topology unit given the level and index 886 kmp_hier_top_unit_t<T> *get_unit(int level, int index) { 887 KMP_DEBUG_ASSERT(level >= 0); 888 KMP_DEBUG_ASSERT(level < num_layers); 889 KMP_DEBUG_ASSERT(index >= 0); 890 KMP_DEBUG_ASSERT(index < get_length(level)); 891 return &(layers[level][index]); 892 } 893 // Returns the number of layers in the hierarchy 894 int get_num_layers() const { return num_layers; } 895 // Returns the number of threads in the top layer 896 // This is necessary because we don't store a topology unit as 897 // the very top level and the scheduling algorithms need this information 898 int get_top_level_nproc() const { return top_level_nproc; } 899 // Return whether this hierarchy is valid or not 900 bool is_valid() const { return valid; } 901 #ifdef KMP_DEBUG 902 // Print the hierarchy 903 void print() { 904 KD_TRACE(10, ("kmp_hier_t:\n")); 905 for (int i = num_layers - 1; i >= 0; --i) { 906 KD_TRACE(10, ("Info[%d] = ", i)); 907 info[i].print(); 908 } 909 for (int i = num_layers - 1; i >= 0; --i) { 910 KD_TRACE(10, ("Layer[%d] =\n", i)); 911 for (int j = 0; j < info[i].length; ++j) { 912 layers[i][j].print(); 913 } 914 } 915 } 916 #endif 917 }; 918 919 template <typename T> 920 void __kmp_dispatch_init_hierarchy(ident_t *loc, int n, 921 kmp_hier_layer_e *new_layers, 922 enum sched_type *new_scheds, 923 typename traits_t<T>::signed_t *new_chunks, 924 T lb, T ub, 925 typename traits_t<T>::signed_t st) { 926 int tid, gtid, num_hw_threads, num_threads_per_layer1, active; 927 int my_buffer_index; 928 kmp_info_t *th; 929 kmp_team_t *team; 930 dispatch_private_info_template<T> *pr; 931 dispatch_shared_info_template<T> volatile *sh; 932 gtid = __kmp_entry_gtid(); 933 tid = __kmp_tid_from_gtid(gtid); 934 #ifdef KMP_DEBUG 935 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n", 936 gtid, n)); 937 for (int i = 0; i < n; ++i) { 938 const char *layer = __kmp_get_hier_str(new_layers[i]); 939 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, " 940 "new_scheds[%d] = %d, new_chunks[%d] = %u\n", 941 gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i])); 942 } 943 #endif // KMP_DEBUG 944 KMP_DEBUG_ASSERT(n > 0); 945 KMP_DEBUG_ASSERT(new_layers); 946 KMP_DEBUG_ASSERT(new_scheds); 947 KMP_DEBUG_ASSERT(new_chunks); 948 if (!TCR_4(__kmp_init_parallel)) 949 __kmp_parallel_initialize(); 950 __kmp_resume_if_soft_paused(); 951 952 th = __kmp_threads[gtid]; 953 team = th->th.th_team; 954 active = !team->t.t_serialized; 955 th->th.th_ident = loc; 956 num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 957 KMP_DEBUG_ASSERT(th->th.th_dispatch == 958 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 959 my_buffer_index = th->th.th_dispatch->th_disp_index; 960 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 961 &th->th.th_dispatch 962 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 963 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 964 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 965 if (!active) { 966 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. " 967 "Using normal dispatch functions.\n", 968 gtid)); 969 KMP_DEBUG_ASSERT(pr); 970 pr->flags.use_hier = FALSE; 971 pr->flags.contains_last = FALSE; 972 return; 973 } 974 KMP_DEBUG_ASSERT(pr); 975 KMP_DEBUG_ASSERT(sh); 976 pr->flags.use_hier = TRUE; 977 pr->u.p.tc = 0; 978 // Have master allocate the hierarchy 979 if (__kmp_tid_from_gtid(gtid) == 0) { 980 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating " 981 "hierarchy\n", 982 gtid, pr, sh)); 983 if (sh->hier == NULL) { 984 sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>)); 985 } 986 sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks); 987 sh->u.s.iteration = 0; 988 } 989 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 990 // Check to make sure the hierarchy is valid 991 kmp_hier_t<T> *hier = sh->hier; 992 if (!sh->hier->is_valid()) { 993 pr->flags.use_hier = FALSE; 994 return; 995 } 996 // Have threads allocate their thread-private barrier data if it hasn't 997 // already been allocated 998 if (th->th.th_hier_bar_data == NULL) { 999 th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate( 1000 sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST); 1001 } 1002 // Have threads "register" themselves by modifying the active count for each 1003 // level they are involved in. The active count will act as nthreads for that 1004 // level regarding the scheduling algorithms 1005 for (int i = 0; i < n; ++i) { 1006 int index = __kmp_dispatch_get_index(tid, hier->get_type(i)); 1007 kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index); 1008 // Setup the thread's private dispatch buffer's hierarchy pointers 1009 if (i == 0) 1010 pr->hier_parent = my_unit; 1011 // If this unit is already active, then increment active count and wait 1012 if (my_unit->is_active()) { 1013 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) " 1014 "is already active (%d)\n", 1015 gtid, my_unit, my_unit->active)); 1016 KMP_TEST_THEN_INC32(&(my_unit->active)); 1017 break; 1018 } 1019 // Flag that this unit is active 1020 if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) { 1021 // Do not setup parent pointer for top level unit since it has no parent 1022 if (i < n - 1) { 1023 // Setup middle layer pointers to parents 1024 my_unit->get_my_pr()->hier_id = 1025 index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i), 1026 hier->get_type(i + 1)); 1027 int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1)); 1028 my_unit->hier_parent = hier->get_unit(i + 1, parent_index); 1029 } else { 1030 // Setup top layer information (no parent pointers are set) 1031 my_unit->get_my_pr()->hier_id = 1032 index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i), 1033 kmp_hier_layer_e::LAYER_LOOP); 1034 KMP_TEST_THEN_INC32(&(hier->top_level_nproc)); 1035 my_unit->hier_parent = nullptr; 1036 } 1037 // Set trip count to 0 so that next() operation will initially climb up 1038 // the hierarchy to get more iterations (early exit in next() for tc == 0) 1039 my_unit->get_my_pr()->u.p.tc = 0; 1040 // Increment this layer's number of active units 1041 KMP_TEST_THEN_INC32(&(hier->info[i].num_active)); 1042 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) " 1043 "incrementing num_active\n", 1044 gtid, my_unit)); 1045 } else { 1046 KMP_TEST_THEN_INC32(&(my_unit->active)); 1047 break; 1048 } 1049 } 1050 // Set this thread's id 1051 num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2( 1052 kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0)); 1053 pr->hier_id = tid % num_threads_per_layer1; 1054 // For oversubscribed threads, increment their index within the lowest unit 1055 // This is done to prevent having two or more threads with id 0, id 1, etc. 1056 if (tid >= num_hw_threads) 1057 pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1); 1058 KD_TRACE( 1059 10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n", 1060 gtid, pr->hier_id)); 1061 1062 pr->flags.contains_last = FALSE; 1063 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 1064 1065 // Now that the number of active threads at each level is determined, 1066 // the barrier data for each unit can be initialized and the last layer's 1067 // loop information can be initialized. 1068 int prev_id = pr->get_hier_id(); 1069 for (int i = 0; i < n; ++i) { 1070 if (prev_id != 0) 1071 break; 1072 int index = __kmp_dispatch_get_index(tid, hier->get_type(i)); 1073 kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index); 1074 // Only master threads of this unit within the hierarchy do initialization 1075 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n", 1076 gtid, i)); 1077 my_unit->reset_shared_barrier(); 1078 my_unit->hier_pr.flags.contains_last = FALSE; 1079 // Last layer, initialize the private buffers with entire loop information 1080 // Now the next next_algorithm() call will get the first chunk of 1081 // iterations properly 1082 if (i == n - 1) { 1083 __kmp_dispatch_init_algorithm<T>( 1084 loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st, 1085 #if USE_ITT_BUILD 1086 NULL, 1087 #endif 1088 hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id()); 1089 } 1090 prev_id = my_unit->get_hier_id(); 1091 } 1092 // Initialize each layer of the thread's private barrier data 1093 kmp_hier_top_unit_t<T> *unit = pr->hier_parent; 1094 for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) { 1095 kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]); 1096 unit->reset_private_barrier(tdata); 1097 } 1098 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 1099 1100 #ifdef KMP_DEBUG 1101 if (__kmp_tid_from_gtid(gtid) == 0) { 1102 for (int i = 0; i < n; ++i) { 1103 KD_TRACE(10, 1104 ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n", 1105 gtid, i, hier->get_num_active(i))); 1106 } 1107 hier->print(); 1108 } 1109 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 1110 #endif // KMP_DEBUG 1111 } 1112 #endif 1113