1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 /* Dynamic scheduling initialization and dispatch. 14 * 15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 16 * it may change values between parallel regions. __kmp_max_nth 17 * is the largest value __kmp_nth may take, 1 is the smallest. 18 */ 19 20 #include "kmp.h" 21 #include "kmp_error.h" 22 #include "kmp_i18n.h" 23 #include "kmp_itt.h" 24 #include "kmp_stats.h" 25 #include "kmp_str.h" 26 #if KMP_USE_X87CONTROL 27 #include <float.h> 28 #endif 29 #include "kmp_lock.h" 30 #include "kmp_dispatch.h" 31 #if KMP_USE_HIER_SCHED 32 #include "kmp_dispatch_hier.h" 33 #endif 34 35 #if OMPT_SUPPORT 36 #include "ompt-specific.h" 37 #endif 38 39 /* ------------------------------------------------------------------------ */ 40 /* ------------------------------------------------------------------------ */ 41 42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 43 kmp_info_t *th; 44 45 KMP_DEBUG_ASSERT(gtid_ref); 46 47 if (__kmp_env_consistency_check) { 48 th = __kmp_threads[*gtid_ref]; 49 if (th->th.th_root->r.r_active && 50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 51 #if KMP_USE_DYNAMIC_LOCK 52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 53 #else 54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 55 #endif 56 } 57 } 58 } 59 60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 61 kmp_info_t *th; 62 63 if (__kmp_env_consistency_check) { 64 th = __kmp_threads[*gtid_ref]; 65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 67 } 68 } 69 } 70 71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC 72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, 73 bool use_hier = false) { 74 // Pick up the nonmonotonic/monotonic bits from the scheduling type 75 // TODO: make nonmonotonic when static_steal is fixed 76 int monotonicity = SCHEDULE_MONOTONIC; 77 78 // Let default be monotonic for executables 79 // compiled with OpenMP* 4.5 or less compilers 80 if (loc != NULL && loc->get_openmp_version() < 50) 81 monotonicity = SCHEDULE_MONOTONIC; 82 83 if (use_hier || __kmp_force_monotonic) 84 monotonicity = SCHEDULE_MONOTONIC; 85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 86 monotonicity = SCHEDULE_NONMONOTONIC; 87 else if (SCHEDULE_HAS_MONOTONIC(schedule)) 88 monotonicity = SCHEDULE_MONOTONIC; 89 90 return monotonicity; 91 } 92 93 #if KMP_STATIC_STEAL_ENABLED 94 enum { // values for steal_flag (possible states of private per-loop buffer) 95 UNUSED = 0, 96 CLAIMED = 1, // owner thread started initialization 97 READY = 2, // available for stealing 98 THIEF = 3 // finished by owner, or claimed by thief 99 // possible state changes: 100 // 0 -> 1 owner only, sync 101 // 0 -> 3 thief only, sync 102 // 1 -> 2 owner only, async 103 // 2 -> 3 owner only, async 104 // 3 -> 2 owner only, async 105 // 3 -> 0 last thread finishing the loop, async 106 }; 107 #endif 108 109 // Initialize a 
dispatch_private_info_template<T> buffer for a particular 110 // type of schedule,chunk. The loop description is found in lb (lower bound), 111 // ub (upper bound), and st (stride). nproc is the number of threads relevant 112 // to the scheduling (often the number of threads in a team, but not always if 113 // hierarchical scheduling is used). tid is the id of the thread calling 114 // the function within the group of nproc threads. It will have a value 115 // between 0 and nproc - 1. This is often just the thread id within a team, but 116 // is not necessarily the case when using hierarchical scheduling. 117 // loc is the source file location of the corresponding loop 118 // gtid is the global thread id 119 template <typename T> 120 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 121 dispatch_private_info_template<T> *pr, 122 enum sched_type schedule, T lb, T ub, 123 typename traits_t<T>::signed_t st, 124 #if USE_ITT_BUILD 125 kmp_uint64 *cur_chunk, 126 #endif 127 typename traits_t<T>::signed_t chunk, 128 T nproc, T tid) { 129 typedef typename traits_t<T>::unsigned_t UT; 130 typedef typename traits_t<T>::floating_t DBL; 131 132 int active; 133 T tc; 134 kmp_info_t *th; 135 kmp_team_t *team; 136 int monotonicity; 137 bool use_hier; 138 139 #ifdef KMP_DEBUG 140 typedef typename traits_t<T>::signed_t ST; 141 { 142 char *buff; 143 // create format specifiers before the debug output 144 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 145 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 146 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 147 traits_t<T>::spec, traits_t<T>::spec, 148 traits_t<ST>::spec, traits_t<ST>::spec, 149 traits_t<T>::spec, traits_t<T>::spec); 150 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 151 __kmp_str_free(&buff); 152 } 153 #endif 154 /* setup data */ 155 th = __kmp_threads[gtid]; 156 team = th->th.th_team; 157 active = !team->t.t_serialized; 158 159 #if USE_ITT_BUILD 160 int itt_need_metadata_reporting = 161 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 162 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 163 team->t.t_active_level == 1; 164 #endif 165 166 #if KMP_USE_HIER_SCHED 167 use_hier = pr->flags.use_hier; 168 #else 169 use_hier = false; 170 #endif 171 172 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 173 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 174 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 175 176 /* Pick up the nomerge/ordered bits from the scheduling type */ 177 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 178 pr->flags.nomerge = TRUE; 179 schedule = 180 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 181 } else { 182 pr->flags.nomerge = FALSE; 183 } 184 pr->type_size = traits_t<T>::type_size; // remember the size of variables 185 if (kmp_ord_lower & schedule) { 186 pr->flags.ordered = TRUE; 187 schedule = 188 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 189 } else { 190 pr->flags.ordered = FALSE; 191 } 192 // Ordered overrides nonmonotonic 193 if (pr->flags.ordered) { 194 monotonicity = SCHEDULE_MONOTONIC; 195 } 196 197 if (schedule == kmp_sch_static) { 198 schedule = __kmp_static; 199 } else { 200 if (schedule == kmp_sch_runtime) { 201 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 202 // not specified) 203 schedule = team->t.t_sched.r_sched_type; 204 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 205 schedule = 
SCHEDULE_WITHOUT_MODIFIERS(schedule); 206 if (pr->flags.ordered) // correct monotonicity for ordered loop if needed 207 monotonicity = SCHEDULE_MONOTONIC; 208 // Detail the schedule if needed (global controls are differentiated 209 // appropriately) 210 if (schedule == kmp_sch_guided_chunked) { 211 schedule = __kmp_guided; 212 } else if (schedule == kmp_sch_static) { 213 schedule = __kmp_static; 214 } 215 // Use the chunk size specified by OMP_SCHEDULE (or default if not 216 // specified) 217 chunk = team->t.t_sched.chunk; 218 #if USE_ITT_BUILD 219 if (cur_chunk) 220 *cur_chunk = chunk; 221 #endif 222 #ifdef KMP_DEBUG 223 { 224 char *buff; 225 // create format specifiers before the debug output 226 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 227 "schedule:%%d chunk:%%%s\n", 228 traits_t<ST>::spec); 229 KD_TRACE(10, (buff, gtid, schedule, chunk)); 230 __kmp_str_free(&buff); 231 } 232 #endif 233 } else { 234 if (schedule == kmp_sch_guided_chunked) { 235 schedule = __kmp_guided; 236 } 237 if (chunk <= 0) { 238 chunk = KMP_DEFAULT_CHUNK; 239 } 240 } 241 242 if (schedule == kmp_sch_auto) { 243 // mapping and differentiation: in the __kmp_do_serial_initialize() 244 schedule = __kmp_auto; 245 #ifdef KMP_DEBUG 246 { 247 char *buff; 248 // create format specifiers before the debug output 249 buff = __kmp_str_format( 250 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 251 "schedule:%%d chunk:%%%s\n", 252 traits_t<ST>::spec); 253 KD_TRACE(10, (buff, gtid, schedule, chunk)); 254 __kmp_str_free(&buff); 255 } 256 #endif 257 } 258 #if KMP_STATIC_STEAL_ENABLED 259 // map nonmonotonic:dynamic to static steal 260 if (schedule == kmp_sch_dynamic_chunked) { 261 if (monotonicity == SCHEDULE_NONMONOTONIC) 262 schedule = kmp_sch_static_steal; 263 } 264 #endif 265 /* guided analytical not safe for too many threads */ 266 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 267 schedule = kmp_sch_guided_iterative_chunked; 268 KMP_WARNING(DispatchManyThreads); 269 } 270 if (schedule == kmp_sch_runtime_simd) { 271 // compiler provides simd_width in the chunk parameter 272 schedule = team->t.t_sched.r_sched_type; 273 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 274 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 275 // Detail the schedule if needed (global controls are differentiated 276 // appropriately) 277 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 278 schedule == __kmp_static) { 279 schedule = kmp_sch_static_balanced_chunked; 280 } else { 281 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 282 schedule = kmp_sch_guided_simd; 283 } 284 chunk = team->t.t_sched.chunk * chunk; 285 } 286 #if USE_ITT_BUILD 287 if (cur_chunk) 288 *cur_chunk = chunk; 289 #endif 290 #ifdef KMP_DEBUG 291 { 292 char *buff; 293 // create format specifiers before the debug output 294 buff = __kmp_str_format( 295 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 296 " chunk:%%%s\n", 297 traits_t<ST>::spec); 298 KD_TRACE(10, (buff, gtid, schedule, chunk)); 299 __kmp_str_free(&buff); 300 } 301 #endif 302 } 303 pr->u.p.parm1 = chunk; 304 } 305 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 306 "unknown scheduling type"); 307 308 pr->u.p.count = 0; 309 310 if (__kmp_env_consistency_check) { 311 if (st == 0) { 312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 313 (pr->flags.ordered ? 
                            ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) have active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;
      kmp_uint32 old = UNUSED;
      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-private-buffer lock,
        // free memory in __kmp_dispatch_next when status==0.
        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.steal_lock);
      }
      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      if (claimed) { // did we succeed in claiming our own buffer?
        pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
        // Other threads will inspect steal_flag when searching for a victim.
        // READY means other threads may steal from this thread from now on.
        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
      } else {
        // another thread has already stolen our whole range
        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
        pr->u.p.ub = init; // mark that there are no iterations to work on
      }
      pr->u.p.parm2 = ntc; // save number of chunks
      // parm3 is the number of times to attempt stealing, which is
      // nproc (just a heuristic; could be optimized later on).
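      // For illustration (hypothetical numbers): with tc = 100 iterations and
      // chunk = 7 there are ntc = 15 chunks; with nproc = 4, small_chunk = 3
      // and extras = 3, so the initial half-open ranges of chunk indices
      // [count, ub) are tid 0 -> [0, 4), tid 1 -> [4, 8), tid 2 -> [8, 12),
      // tid 3 -> [12, 15). The first 'extras' threads own one extra chunk;
      // chunks an owner does not get to may later be stolen from the tail of
      // its range.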
409 pr->u.p.parm3 = nproc; 410 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 411 break; 412 } else { 413 /* too few chunks: switching to kmp_sch_dynamic_chunked */ 414 schedule = kmp_sch_dynamic_chunked; 415 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " 416 "kmp_sch_dynamic_chunked\n", 417 gtid)); 418 goto dynamic_init; 419 break; 420 } // if 421 } // case 422 #endif 423 case kmp_sch_static_balanced: { 424 T init, limit; 425 426 KD_TRACE( 427 100, 428 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 429 gtid)); 430 431 if (nproc > 1) { 432 T id = tid; 433 434 if (tc < nproc) { 435 if (id < tc) { 436 init = id; 437 limit = id; 438 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 439 } else { 440 pr->u.p.count = 1; /* means no more chunks to execute */ 441 pr->u.p.parm1 = FALSE; 442 break; 443 } 444 } else { 445 T small_chunk = tc / nproc; 446 T extras = tc % nproc; 447 init = id * small_chunk + (id < extras ? id : extras); 448 limit = init + small_chunk - (id < extras ? 0 : 1); 449 pr->u.p.parm1 = (id == nproc - 1); 450 } 451 } else { 452 if (tc > 0) { 453 init = 0; 454 limit = tc - 1; 455 pr->u.p.parm1 = TRUE; 456 } else { 457 // zero trip count 458 pr->u.p.count = 1; /* means no more chunks to execute */ 459 pr->u.p.parm1 = FALSE; 460 break; 461 } 462 } 463 #if USE_ITT_BUILD 464 // Calculate chunk for metadata report 465 if (itt_need_metadata_reporting) 466 if (cur_chunk) 467 *cur_chunk = limit - init + 1; 468 #endif 469 if (st == 1) { 470 pr->u.p.lb = lb + init; 471 pr->u.p.ub = lb + limit; 472 } else { 473 // calculated upper bound, "ub" is user-defined upper bound 474 T ub_tmp = lb + limit * st; 475 pr->u.p.lb = lb + init * st; 476 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 477 // it exactly 478 if (st > 0) { 479 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 480 } else { 481 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 482 } 483 } 484 if (pr->flags.ordered) { 485 pr->u.p.ordered_lower = init; 486 pr->u.p.ordered_upper = limit; 487 } 488 break; 489 } // case 490 case kmp_sch_static_balanced_chunked: { 491 // similar to balanced, but chunk adjusted to multiple of simd width 492 T nth = nproc; 493 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 494 " -> falling-through to static_greedy\n", 495 gtid)); 496 schedule = kmp_sch_static_greedy; 497 if (nth > 1) 498 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 499 else 500 pr->u.p.parm1 = tc; 501 break; 502 } // case 503 case kmp_sch_guided_simd: 504 case kmp_sch_guided_iterative_chunked: { 505 KD_TRACE( 506 100, 507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 508 " case\n", 509 gtid)); 510 511 if (nproc > 1) { 512 if ((2L * chunk + 1) * nproc >= tc) { 513 /* chunk size too large, switch to dynamic */ 514 schedule = kmp_sch_dynamic_chunked; 515 goto dynamic_init; 516 } else { 517 // when remaining iters become less than parm2 - switch to dynamic 518 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 519 *(double *)&pr->u.p.parm3 = 520 guided_flt_param / (double)nproc; // may occupy parm3 and parm4 521 } 522 } else { 523 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 524 "kmp_sch_static_greedy\n", 525 gtid)); 526 schedule = kmp_sch_static_greedy; 527 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 528 KD_TRACE( 529 100, 530 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 531 gtid)); 532 pr->u.p.parm1 = tc; 533 } // if 534 } // case 535 break; 536 case kmp_sch_guided_analytical_chunked: { 537 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 538 "kmp_sch_guided_analytical_chunked case\n", 539 gtid)); 540 541 if (nproc > 1) { 542 if ((2L * chunk + 1) * nproc >= tc) { 543 /* chunk size too large, switch to dynamic */ 544 schedule = kmp_sch_dynamic_chunked; 545 goto dynamic_init; 546 } else { 547 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 548 DBL x; 549 550 #if KMP_USE_X87CONTROL 551 /* Linux* OS already has 64-bit computation by default for long double, 552 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 553 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 554 instead of the default 53-bit. Even though long double doesn't work 555 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 556 expected to impact the correctness of the algorithm, but this has not 557 been mathematically proven. 
*/ 558 // save original FPCW and set precision to 64-bit, as 559 // Windows* OS on IA-32 architecture defaults to 53-bit 560 unsigned int oldFpcw = _control87(0, 0); 561 _control87(_PC_64, _MCW_PC); // 0,0x30000 562 #endif 563 /* value used for comparison in solver for cross-over point */ 564 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 565 566 /* crossover point--chunk indexes equal to or greater than 567 this point switch to dynamic-style scheduling */ 568 UT cross; 569 570 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 571 x = 1.0 - 0.5 / (double)nproc; 572 573 #ifdef KMP_DEBUG 574 { // test natural alignment 575 struct _test_a { 576 char a; 577 union { 578 char b; 579 DBL d; 580 }; 581 } t; 582 ptrdiff_t natural_alignment = 583 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 584 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 585 // long)natural_alignment ); 586 KMP_DEBUG_ASSERT( 587 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 588 } 589 #endif // KMP_DEBUG 590 591 /* save the term in thread private dispatch structure */ 592 *(DBL *)&pr->u.p.parm3 = x; 593 594 /* solve for the crossover point to the nearest integer i for which C_i 595 <= chunk */ 596 { 597 UT left, right, mid; 598 long double p; 599 600 /* estimate initial upper and lower bound */ 601 602 /* doesn't matter what value right is as long as it is positive, but 603 it affects performance of the solver */ 604 right = 229; 605 p = __kmp_pow<UT>(x, right); 606 if (p > target) { 607 do { 608 p *= p; 609 right <<= 1; 610 } while (p > target && right < (1 << 27)); 611 /* lower bound is previous (failed) estimate of upper bound */ 612 left = right >> 1; 613 } else { 614 left = 0; 615 } 616 617 /* bisection root-finding method */ 618 while (left + 1 < right) { 619 mid = (left + right) / 2; 620 if (__kmp_pow<UT>(x, mid) > target) { 621 left = mid; 622 } else { 623 right = mid; 624 } 625 } // while 626 cross = right; 627 } 628 /* assert sanity of computed crossover point */ 629 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 630 __kmp_pow<UT>(x, cross) <= target); 631 632 /* save the crossover point in thread private dispatch structure */ 633 pr->u.p.parm2 = cross; 634 635 // C75803 636 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 637 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 638 #else 639 #define GUIDED_ANALYTICAL_WORKAROUND (x) 640 #endif 641 /* dynamic-style scheduling offset */ 642 pr->u.p.count = tc - 643 __kmp_dispatch_guided_remaining( 644 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 645 cross * chunk; 646 #if KMP_USE_X87CONTROL 647 // restore FPCW 648 _control87(oldFpcw, _MCW_PC); 649 #endif 650 } // if 651 } else { 652 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 653 "kmp_sch_static_greedy\n", 654 gtid)); 655 schedule = kmp_sch_static_greedy; 656 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 657 pr->u.p.parm1 = tc; 658 } // if 659 } // case 660 break; 661 case kmp_sch_static_greedy: 662 KD_TRACE( 663 100, 664 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 665 gtid)); 666 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 667 break; 668 case kmp_sch_static_chunked: 669 case kmp_sch_dynamic_chunked: 670 dynamic_init: 671 if (pr->u.p.parm1 <= 0) 672 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 673 else if (pr->u.p.parm1 > tc) 674 pr->u.p.parm1 = tc; 675 // Store the total number of chunks to prevent integer overflow during 676 // bounds calculations in the get next chunk routine. 677 pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0); 678 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 679 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 680 gtid)); 681 break; 682 case kmp_sch_trapezoidal: { 683 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 684 685 T parm1, parm2, parm3, parm4; 686 KD_TRACE(100, 687 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 688 gtid)); 689 690 parm1 = chunk; 691 692 /* F : size of the first cycle */ 693 parm2 = (tc / (2 * nproc)); 694 695 if (parm2 < 1) { 696 parm2 = 1; 697 } 698 699 /* L : size of the last cycle. Make sure the last cycle is not larger 700 than the first cycle. */ 701 if (parm1 < 1) { 702 parm1 = 1; 703 } else if (parm1 > parm2) { 704 parm1 = parm2; 705 } 706 707 /* N : number of cycles */ 708 parm3 = (parm2 + parm1); 709 parm3 = (2 * tc + parm3 - 1) / parm3; 710 711 if (parm3 < 2) { 712 parm3 = 2; 713 } 714 715 /* sigma : decreasing incr of the trapezoid */ 716 parm4 = (parm3 - 1); 717 parm4 = (parm2 - parm1) / parm4; 718 719 // pointless check, because parm4 >= 0 always 720 // if ( parm4 < 0 ) { 721 // parm4 = 0; 722 //} 723 724 pr->u.p.parm1 = parm1; 725 pr->u.p.parm2 = parm2; 726 pr->u.p.parm3 = parm3; 727 pr->u.p.parm4 = parm4; 728 } // case 729 break; 730 731 default: { 732 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 733 KMP_HNT(GetNewerLibrary), // Hint 734 __kmp_msg_null // Variadic argument list terminator 735 ); 736 } break; 737 } // switch 738 pr->schedule = schedule; 739 } 740 741 #if KMP_USE_HIER_SCHED 742 template <typename T> 743 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 744 typename traits_t<T>::signed_t st); 745 template <> 746 inline void 747 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 748 kmp_int32 ub, kmp_int32 st) { 749 __kmp_dispatch_init_hierarchy<kmp_int32>( 750 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 751 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 752 } 753 template <> 754 inline void 755 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 756 kmp_uint32 ub, kmp_int32 st) { 757 __kmp_dispatch_init_hierarchy<kmp_uint32>( 758 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 759 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 760 } 761 template <> 762 inline void 763 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 764 kmp_int64 ub, kmp_int64 st) { 765 __kmp_dispatch_init_hierarchy<kmp_int64>( 766 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 767 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 768 } 769 template <> 770 inline void 771 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 772 kmp_uint64 ub, kmp_int64 st) { 773 __kmp_dispatch_init_hierarchy<kmp_uint64>( 774 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 775 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 776 } 777 778 // free all the hierarchy scheduling memory associated with the team 779 void 
__kmp_dispatch_free_hierarchies(kmp_team_t *team) { 780 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 781 for (int i = 0; i < num_disp_buff; ++i) { 782 // type does not matter here so use kmp_int32 783 auto sh = 784 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 785 &team->t.t_disp_buffer[i]); 786 if (sh->hier) { 787 sh->hier->deallocate(); 788 __kmp_free(sh->hier); 789 } 790 } 791 } 792 #endif 793 794 // UT - unsigned flavor of T, ST - signed flavor of T, 795 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 796 template <typename T> 797 static void 798 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 799 T ub, typename traits_t<T>::signed_t st, 800 typename traits_t<T>::signed_t chunk, int push_ws) { 801 typedef typename traits_t<T>::unsigned_t UT; 802 803 int active; 804 kmp_info_t *th; 805 kmp_team_t *team; 806 kmp_uint32 my_buffer_index; 807 dispatch_private_info_template<T> *pr; 808 dispatch_shared_info_template<T> volatile *sh; 809 810 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 811 sizeof(dispatch_private_info)); 812 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 813 sizeof(dispatch_shared_info)); 814 __kmp_assert_valid_gtid(gtid); 815 816 if (!TCR_4(__kmp_init_parallel)) 817 __kmp_parallel_initialize(); 818 819 __kmp_resume_if_soft_paused(); 820 821 #if INCLUDE_SSC_MARKS 822 SSC_MARK_DISPATCH_INIT(); 823 #endif 824 #ifdef KMP_DEBUG 825 typedef typename traits_t<T>::signed_t ST; 826 { 827 char *buff; 828 // create format specifiers before the debug output 829 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 830 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 831 traits_t<ST>::spec, traits_t<T>::spec, 832 traits_t<T>::spec, traits_t<ST>::spec); 833 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 834 __kmp_str_free(&buff); 835 } 836 #endif 837 /* setup data */ 838 th = __kmp_threads[gtid]; 839 team = th->th.th_team; 840 active = !team->t.t_serialized; 841 th->th.th_ident = loc; 842 843 // Any half-decent optimizer will remove this test when the blocks are empty 844 // since the macros expand to nothing 845 // when statistics are disabled. 846 if (schedule == __kmp_static) { 847 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 848 } else { 849 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 850 } 851 852 #if KMP_USE_HIER_SCHED 853 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 854 // Hierarchical scheduling does not work with ordered, so if ordered is 855 // detected, then revert back to threaded scheduling. 856 bool ordered; 857 enum sched_type my_sched = schedule; 858 my_buffer_index = th->th.th_dispatch->th_disp_index; 859 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 860 &th->th.th_dispatch 861 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 862 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 863 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 864 my_sched = 865 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 866 ordered = (kmp_ord_lower & my_sched); 867 if (pr->flags.use_hier) { 868 if (ordered) { 869 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 870 "Disabling hierarchical scheduling.\n", 871 gtid)); 872 pr->flags.use_hier = FALSE; 873 } 874 } 875 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 876 // Don't use hierarchical for ordered parallel loops and don't 877 // use the runtime hierarchy if one was specified in the program 878 if (!ordered && !pr->flags.use_hier) 879 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 880 } 881 #endif // KMP_USE_HIER_SCHED 882 883 #if USE_ITT_BUILD 884 kmp_uint64 cur_chunk = chunk; 885 int itt_need_metadata_reporting = 886 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 887 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 888 team->t.t_active_level == 1; 889 #endif 890 if (!active) { 891 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 892 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 893 } else { 894 KMP_DEBUG_ASSERT(th->th.th_dispatch == 895 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 896 897 my_buffer_index = th->th.th_dispatch->th_disp_index++; 898 899 /* What happens when number of threads changes, need to resize buffer? */ 900 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 901 &th->th.th_dispatch 902 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 903 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 904 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 905 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 906 my_buffer_index)); 907 if (sh->buffer_index != my_buffer_index) { // too many loops in progress? 908 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d" 909 " sh->buffer_index:%d\n", 910 gtid, my_buffer_index, sh->buffer_index)); 911 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 912 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 913 // Note: KMP_WAIT() cannot be used there: buffer index and 914 // my_buffer_index are *always* 32-bit integers. 
915 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 916 "sh->buffer_index:%d\n", 917 gtid, my_buffer_index, sh->buffer_index)); 918 } 919 } 920 921 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 922 #if USE_ITT_BUILD 923 &cur_chunk, 924 #endif 925 chunk, (T)th->th.th_team_nproc, 926 (T)th->th.th_info.ds.ds_tid); 927 if (active) { 928 if (pr->flags.ordered == 0) { 929 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 930 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 931 } else { 932 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 933 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 934 } 935 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 936 th->th.th_dispatch->th_dispatch_sh_current = 937 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 938 #if USE_ITT_BUILD 939 if (pr->flags.ordered) { 940 __kmp_itt_ordered_init(gtid); 941 } 942 // Report loop metadata 943 if (itt_need_metadata_reporting) { 944 // Only report metadata by primary thread of active team at level 1 945 kmp_uint64 schedtype = 0; 946 switch (schedule) { 947 case kmp_sch_static_chunked: 948 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 949 break; 950 case kmp_sch_static_greedy: 951 cur_chunk = pr->u.p.parm1; 952 break; 953 case kmp_sch_dynamic_chunked: 954 schedtype = 1; 955 break; 956 case kmp_sch_guided_iterative_chunked: 957 case kmp_sch_guided_analytical_chunked: 958 case kmp_sch_guided_simd: 959 schedtype = 2; 960 break; 961 default: 962 // Should we put this case under "static"? 963 // case kmp_sch_static_steal: 964 schedtype = 3; 965 break; 966 } 967 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 968 } 969 #if KMP_USE_HIER_SCHED 970 if (pr->flags.use_hier) { 971 pr->u.p.count = 0; 972 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 973 } 974 #endif // KMP_USER_HIER_SCHED 975 #endif /* USE_ITT_BUILD */ 976 } 977 978 #ifdef KMP_DEBUG 979 { 980 char *buff; 981 // create format specifiers before the debug output 982 buff = __kmp_str_format( 983 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 984 "lb:%%%s ub:%%%s" 985 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 986 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 987 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 988 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 989 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 990 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 991 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 992 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 993 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 994 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 995 __kmp_str_free(&buff); 996 } 997 #endif 998 #if OMPT_SUPPORT && OMPT_OPTIONAL 999 if (ompt_enabled.ompt_callback_work) { 1000 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1001 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1002 ompt_callbacks.ompt_callback(ompt_callback_work)( 1003 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 1004 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 1005 } 1006 #endif 1007 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 1008 } 1009 1010 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1011 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1012 * 
every chunk of iterations. If the ordered section(s) were not executed 1013 * for this iteration (or every iteration in this chunk), we need to set the 1014 * ordered iteration counters so that the next thread can proceed. */ 1015 template <typename UT> 1016 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1017 typedef typename traits_t<UT>::signed_t ST; 1018 __kmp_assert_valid_gtid(gtid); 1019 kmp_info_t *th = __kmp_threads[gtid]; 1020 1021 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1022 if (!th->th.th_team->t.t_serialized) { 1023 1024 dispatch_private_info_template<UT> *pr = 1025 reinterpret_cast<dispatch_private_info_template<UT> *>( 1026 th->th.th_dispatch->th_dispatch_pr_current); 1027 dispatch_shared_info_template<UT> volatile *sh = 1028 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1029 th->th.th_dispatch->th_dispatch_sh_current); 1030 KMP_DEBUG_ASSERT(pr); 1031 KMP_DEBUG_ASSERT(sh); 1032 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1033 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1034 1035 if (pr->ordered_bumped) { 1036 KD_TRACE( 1037 1000, 1038 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1039 gtid)); 1040 pr->ordered_bumped = 0; 1041 } else { 1042 UT lower = pr->u.p.ordered_lower; 1043 1044 #ifdef KMP_DEBUG 1045 { 1046 char *buff; 1047 // create format specifiers before the debug output 1048 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1049 "ordered_iteration:%%%s lower:%%%s\n", 1050 traits_t<UT>::spec, traits_t<UT>::spec); 1051 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1052 __kmp_str_free(&buff); 1053 } 1054 #endif 1055 1056 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1057 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1058 KMP_MB(); /* is this necessary? 
*/ 1059 #ifdef KMP_DEBUG 1060 { 1061 char *buff; 1062 // create format specifiers before the debug output 1063 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1064 "ordered_iteration:%%%s lower:%%%s\n", 1065 traits_t<UT>::spec, traits_t<UT>::spec); 1066 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1067 __kmp_str_free(&buff); 1068 } 1069 #endif 1070 1071 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1072 } // if 1073 } // if 1074 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1075 } 1076 1077 #ifdef KMP_GOMP_COMPAT 1078 1079 template <typename UT> 1080 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1081 typedef typename traits_t<UT>::signed_t ST; 1082 __kmp_assert_valid_gtid(gtid); 1083 kmp_info_t *th = __kmp_threads[gtid]; 1084 1085 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1086 if (!th->th.th_team->t.t_serialized) { 1087 dispatch_private_info_template<UT> *pr = 1088 reinterpret_cast<dispatch_private_info_template<UT> *>( 1089 th->th.th_dispatch->th_dispatch_pr_current); 1090 dispatch_shared_info_template<UT> volatile *sh = 1091 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1092 th->th.th_dispatch->th_dispatch_sh_current); 1093 KMP_DEBUG_ASSERT(pr); 1094 KMP_DEBUG_ASSERT(sh); 1095 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1096 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1097 1098 UT lower = pr->u.p.ordered_lower; 1099 UT upper = pr->u.p.ordered_upper; 1100 UT inc = upper - lower + 1; 1101 1102 if (pr->ordered_bumped == inc) { 1103 KD_TRACE( 1104 1000, 1105 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1106 gtid)); 1107 pr->ordered_bumped = 0; 1108 } else { 1109 inc -= pr->ordered_bumped; 1110 1111 #ifdef KMP_DEBUG 1112 { 1113 char *buff; 1114 // create format specifiers before the debug output 1115 buff = __kmp_str_format( 1116 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1117 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1118 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1119 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1120 __kmp_str_free(&buff); 1121 } 1122 #endif 1123 1124 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1125 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1126 1127 KMP_MB(); /* is this necessary? */ 1128 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1129 "ordered_bumped to zero\n", 1130 gtid)); 1131 pr->ordered_bumped = 0; 1132 //!!!!! TODO check if the inc should be unsigned, or signed??? 
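      // For instance (illustrative numbers): if this chunk covers ordered
      // iterations 8..11 (inc started at 4) and the ordered construct was
      // entered for only the first of them (ordered_bumped was 1), inc is now
      // 3. Once the wait above has seen ordered_iteration reach 8, the add
      // below publishes the 3 skipped iterations so the thread waiting on
      // iteration 12 can proceed.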
1133 #ifdef KMP_DEBUG 1134 { 1135 char *buff; 1136 // create format specifiers before the debug output 1137 buff = __kmp_str_format( 1138 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1139 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1140 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1141 traits_t<UT>::spec); 1142 KD_TRACE(1000, 1143 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1144 __kmp_str_free(&buff); 1145 } 1146 #endif 1147 1148 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1149 } 1150 // } 1151 } 1152 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1153 } 1154 1155 #endif /* KMP_GOMP_COMPAT */ 1156 1157 template <typename T> 1158 int __kmp_dispatch_next_algorithm(int gtid, 1159 dispatch_private_info_template<T> *pr, 1160 dispatch_shared_info_template<T> volatile *sh, 1161 kmp_int32 *p_last, T *p_lb, T *p_ub, 1162 typename traits_t<T>::signed_t *p_st, T nproc, 1163 T tid) { 1164 typedef typename traits_t<T>::unsigned_t UT; 1165 typedef typename traits_t<T>::signed_t ST; 1166 typedef typename traits_t<T>::floating_t DBL; 1167 int status = 0; 1168 bool last = false; 1169 T start; 1170 ST incr; 1171 UT limit, trip, init; 1172 kmp_info_t *th = __kmp_threads[gtid]; 1173 kmp_team_t *team = th->th.th_team; 1174 1175 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1176 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1177 KMP_DEBUG_ASSERT(pr); 1178 KMP_DEBUG_ASSERT(sh); 1179 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1180 #ifdef KMP_DEBUG 1181 { 1182 char *buff; 1183 // create format specifiers before the debug output 1184 buff = 1185 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1186 "sh:%%p nproc:%%%s tid:%%%s\n", 1187 traits_t<T>::spec, traits_t<T>::spec); 1188 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1189 __kmp_str_free(&buff); 1190 } 1191 #endif 1192 1193 // zero trip count 1194 if (pr->u.p.tc == 0) { 1195 KD_TRACE(10, 1196 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1197 "zero status:%d\n", 1198 gtid, status)); 1199 return 0; 1200 } 1201 1202 switch (pr->schedule) { 1203 #if KMP_STATIC_STEAL_ENABLED 1204 case kmp_sch_static_steal: { 1205 T chunk = pr->u.p.parm1; 1206 UT nchunks = pr->u.p.parm2; 1207 KD_TRACE(100, 1208 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1209 gtid)); 1210 1211 trip = pr->u.p.tc - 1; 1212 1213 if (traits_t<T>::type_size > 4) { 1214 // use lock for 8-byte induction variable. 
1215 // TODO (optional): check presence and use 16-byte CAS 1216 kmp_lock_t *lck = pr->u.p.steal_lock; 1217 KMP_DEBUG_ASSERT(lck != NULL); 1218 if (pr->u.p.count < (UT)pr->u.p.ub) { 1219 KMP_DEBUG_ASSERT(pr->steal_flag == READY); 1220 __kmp_acquire_lock(lck, gtid); 1221 // try to get own chunk of iterations 1222 init = (pr->u.p.count)++; 1223 status = (init < (UT)pr->u.p.ub); 1224 __kmp_release_lock(lck, gtid); 1225 } else { 1226 status = 0; // no own chunks 1227 } 1228 if (!status) { // try to steal 1229 kmp_lock_t *lckv; // victim buffer's lock 1230 T while_limit = pr->u.p.parm3; 1231 T while_index = 0; 1232 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1233 __kmp_dispatch_num_buffers; // current loop index 1234 // note: victim thread can potentially execute another loop 1235 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive 1236 while ((!status) && (while_limit != ++while_index)) { 1237 dispatch_private_info_template<T> *v; 1238 T remaining; 1239 T victimId = pr->u.p.parm4; 1240 T oldVictimId = victimId ? victimId - 1 : nproc - 1; 1241 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1242 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1243 KMP_DEBUG_ASSERT(v); 1244 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && 1245 oldVictimId != victimId) { 1246 victimId = (victimId + 1) % nproc; 1247 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1248 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1249 KMP_DEBUG_ASSERT(v); 1250 } 1251 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { 1252 continue; // try once more (nproc attempts in total) 1253 } 1254 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { 1255 kmp_uint32 old = UNUSED; 1256 // try to steal whole range from inactive victim 1257 status = v->steal_flag.compare_exchange_strong(old, THIEF); 1258 if (status) { 1259 // initialize self buffer with victim's whole range of chunks 1260 T id = victimId; 1261 T small_chunk, extras; 1262 small_chunk = nchunks / nproc; // chunks per thread 1263 extras = nchunks % nproc; 1264 init = id * small_chunk + (id < extras ? id : extras); 1265 __kmp_acquire_lock(lck, gtid); 1266 pr->u.p.count = init + 1; // exclude one we execute immediately 1267 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 1268 __kmp_release_lock(lck, gtid); 1269 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 1270 // no need to reinitialize other thread invariants: lb, st, etc. 
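            // steal_flag recap for this path (see the state table at the top
            // of the file): the victim's buffer went UNUSED -> THIEF via the
            // CAS above, so its owner will find an already-claimed buffer when
            // it finally arrives; our own buffer, marked THIEF when we ran out
            // of local chunks, flips back to READY below once it holds
            // unfinished chunks again, making us a legitimate victim for other
            // thieves.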
#ifdef KMP_DEBUG
            {
              char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format(
                  "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                  "count:%%%s ub:%%%s\n",
                  traits_t<UT>::spec, traits_t<T>::spec);
              KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
              __kmp_str_free(&buff);
            }
#endif
            // activate non-empty buffer and let others steal from us
            if (pr->u.p.count < (UT)pr->u.p.ub)
              KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
            break;
          }
        }
        if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
            v->u.p.count >= (UT)v->u.p.ub) {
          pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
          continue; // no chunks to steal, try next victim
        }
        lckv = v->u.p.steal_lock;
        KMP_ASSERT(lckv != NULL);
        __kmp_acquire_lock(lckv, gtid);
        limit = v->u.p.ub; // keep initial ub
        if (v->u.p.count >= limit) {
          __kmp_release_lock(lckv, gtid);
          pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
          continue; // no chunks to steal, try next victim
        }

        // stealing succeeded; reduce victim's ub by 1/4 of undone chunks
        // TODO: is this heuristic good enough?
        remaining = limit - v->u.p.count;
        if (remaining > 7) {
          // steal 1/4 of remaining
          KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
          init = (v->u.p.ub -= (remaining >> 2));
        } else {
          // steal 1 chunk of 1..7 remaining
          KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
          init = (v->u.p.ub -= 1);
        }
        __kmp_release_lock(lckv, gtid);
#ifdef KMP_DEBUG
        {
          char *buff;
          // create format specifiers before the debug output
          buff = __kmp_str_format(
              "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
              "count:%%%s ub:%%%s\n",
              traits_t<UT>::spec, traits_t<UT>::spec);
          KD_TRACE(10, (buff, gtid, victimId, init, limit));
          __kmp_str_free(&buff);
        }
#endif
        KMP_DEBUG_ASSERT(init + 1 <= limit);
        pr->u.p.parm4 = victimId; // remember victim to steal from
        status = 1;
        // now update own count and ub with stolen range excluding init chunk
        __kmp_acquire_lock(lck, gtid);
        pr->u.p.count = init + 1;
        pr->u.p.ub = limit;
        __kmp_release_lock(lck, gtid);
        // activate non-empty buffer and let others steal from us
        if (init + 1 < limit)
          KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
      } // while (search for victim)
    } // if (try to find victim and steal)
  } else {
    // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
    // as all operations on pair (count, ub) must be done atomically
    typedef union {
      struct {
        UT count;
        T ub;
      } p;
      kmp_int64 b;
    } union_i4;
    union_i4 vold, vnew;
    if (pr->u.p.count < (UT)pr->u.p.ub) {
      KMP_DEBUG_ASSERT(pr->steal_flag == READY);
      vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
      vnew.b = vold.b;
      vnew.p.count++; // get chunk from head of self range
      while (!KMP_COMPARE_AND_STORE_REL64(
          (volatile kmp_int64 *)&pr->u.p.count,
          *VOLATILE_CAST(kmp_int64 *) & vold.b,
          *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
        KMP_CPU_PAUSE();
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew.b = vold.b;
        vnew.p.count++;
      }
      init = vold.p.count;
      status = (init < (UT)vold.p.ub);
    } else {
      status = 0; // no own chunks
    }
    if (!status) { // try to steal
      T while_limit
= pr->u.p.parm3; 1374 T while_index = 0; 1375 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1376 __kmp_dispatch_num_buffers; // current loop index 1377 // note: victim thread can potentially execute another loop 1378 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive 1379 while ((!status) && (while_limit != ++while_index)) { 1380 dispatch_private_info_template<T> *v; 1381 T remaining; 1382 T victimId = pr->u.p.parm4; 1383 T oldVictimId = victimId ? victimId - 1 : nproc - 1; 1384 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1385 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1386 KMP_DEBUG_ASSERT(v); 1387 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && 1388 oldVictimId != victimId) { 1389 victimId = (victimId + 1) % nproc; 1390 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1391 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1392 KMP_DEBUG_ASSERT(v); 1393 } 1394 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { 1395 continue; // try once more (nproc attempts in total) 1396 } 1397 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { 1398 kmp_uint32 old = UNUSED; 1399 // try to steal whole range from inactive victim 1400 status = v->steal_flag.compare_exchange_strong(old, THIEF); 1401 if (status) { 1402 // initialize self buffer with victim's whole range of chunks 1403 T id = victimId; 1404 T small_chunk, extras; 1405 small_chunk = nchunks / nproc; // chunks per thread 1406 extras = nchunks % nproc; 1407 init = id * small_chunk + (id < extras ? id : extras); 1408 vnew.p.count = init + 1; 1409 vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0); 1410 // write pair (count, ub) at once atomically 1411 #if KMP_ARCH_X86 1412 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b); 1413 #else 1414 *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b; 1415 #endif 1416 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 1417 // no need to initialize other thread invariants: lb, st, etc. 1418 #ifdef KMP_DEBUG 1419 { 1420 char *buff; 1421 // create format specifiers before the debug output 1422 buff = __kmp_str_format( 1423 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " 1424 "count:%%%s ub:%%%s\n", 1425 traits_t<UT>::spec, traits_t<T>::spec); 1426 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); 1427 __kmp_str_free(&buff); 1428 } 1429 #endif 1430 // activate non-empty buffer and let others steal from us 1431 if (pr->u.p.count < (UT)pr->u.p.ub) 1432 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 1433 break; 1434 } 1435 } 1436 while (1) { // CAS loop with check if victim still has enough chunks 1437 // many threads may be stealing concurrently from same victim 1438 vold.b = *(volatile kmp_int64 *)(&v->u.p.count); 1439 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY || 1440 vold.p.count >= (UT)vold.p.ub) { 1441 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id 1442 break; // no chunks to steal, try next victim 1443 } 1444 vnew.b = vold.b; 1445 remaining = vold.p.ub - vold.p.count; 1446 // try to steal 1/4 of remaining 1447 // TODO: is this heuristics good enough?? 
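            // Numeric illustration (hypothetical values): if the victim's pair
            // is (count = 10, ub = 30), remaining = 20 chunks, so the thief
            // lowers the victim's ub by 20 >> 2 = 5. The victim keeps chunks
            // 10..24 while the thief runs chunk 25 immediately and keeps
            // (count = 26, ub = 30), i.e. chunks 26..29, to hand out or be
            // stolen from later. With 7 or fewer chunks remaining only a
            // single chunk is taken from the tail.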
            if (remaining > 7) {
              vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
            }
            KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_REL64(
                    (volatile kmp_int64 *)&v->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                    "count:%%%s ub:%%%s\n",
                    traits_t<T>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              pr->u.p.parm4 = victimId; // keep victim id
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              // activate non-empty buffer and let others steal from us
              if (vold.p.count < (UT)vold.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, try again
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      // keep track of done chunks for possible early exit from stealing
      // TODO: count executed chunks locally with rare update of shared location
      // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }
    } // if
    break;
  } // case
#endif // KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = (pr->u.p.parm1 != 0);
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if
((last = (limit >= trip)) != 0) 1562 limit = trip; 1563 1564 if (p_st != NULL) 1565 *p_st = incr; 1566 1567 pr->u.p.count += nproc; 1568 1569 if (incr == 1) { 1570 *p_lb = start + init; 1571 *p_ub = start + limit; 1572 } else { 1573 *p_lb = start + init * incr; 1574 *p_ub = start + limit * incr; 1575 } 1576 1577 if (pr->flags.ordered) { 1578 pr->u.p.ordered_lower = init; 1579 pr->u.p.ordered_upper = limit; 1580 } // if 1581 } // if 1582 } // case 1583 break; 1584 1585 case kmp_sch_dynamic_chunked: { 1586 UT chunk_number; 1587 UT chunk_size = pr->u.p.parm1; 1588 UT nchunks = pr->u.p.parm2; 1589 1590 KD_TRACE( 1591 100, 1592 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1593 gtid)); 1594 1595 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1596 status = (chunk_number < nchunks); 1597 if (!status) { 1598 *p_lb = 0; 1599 *p_ub = 0; 1600 if (p_st != NULL) 1601 *p_st = 0; 1602 } else { 1603 init = chunk_size * chunk_number; 1604 trip = pr->u.p.tc - 1; 1605 start = pr->u.p.lb; 1606 incr = pr->u.p.st; 1607 1608 if ((last = (trip - init < (UT)chunk_size))) 1609 limit = trip; 1610 else 1611 limit = chunk_size + init - 1; 1612 1613 if (p_st != NULL) 1614 *p_st = incr; 1615 1616 if (incr == 1) { 1617 *p_lb = start + init; 1618 *p_ub = start + limit; 1619 } else { 1620 *p_lb = start + init * incr; 1621 *p_ub = start + limit * incr; 1622 } 1623 1624 if (pr->flags.ordered) { 1625 pr->u.p.ordered_lower = init; 1626 pr->u.p.ordered_upper = limit; 1627 } // if 1628 } // if 1629 } // case 1630 break; 1631 1632 case kmp_sch_guided_iterative_chunked: { 1633 T chunkspec = pr->u.p.parm1; 1634 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1635 "iterative case\n", 1636 gtid)); 1637 trip = pr->u.p.tc; 1638 // Start atomic part of calculations 1639 while (1) { 1640 ST remaining; // signed, because can be < 0 1641 init = sh->u.s.iteration; // shared value 1642 remaining = trip - init; 1643 if (remaining <= 0) { // AC: need to compare with 0 first 1644 // nothing to do, don't try atomic op 1645 status = 0; 1646 break; 1647 } 1648 if ((T)remaining < 1649 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1650 // use dynamic-style schedule 1651 // atomically increment iterations, get old value 1652 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1653 (ST)chunkspec); 1654 remaining = trip - init; 1655 if (remaining <= 0) { 1656 status = 0; // all iterations got by other threads 1657 } else { 1658 // got some iterations to work on 1659 status = 1; 1660 if ((T)remaining > chunkspec) { 1661 limit = init + chunkspec - 1; 1662 } else { 1663 last = true; // the last chunk 1664 limit = init + remaining - 1; 1665 } // if 1666 } // if 1667 break; 1668 } // if 1669 limit = init + (UT)((double)remaining * 1670 *(double *)&pr->u.p.parm3); // divide by K*nproc 1671 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1672 (ST)init, (ST)limit)) { 1673 // CAS was successful, chunk obtained 1674 status = 1; 1675 --limit; 1676 break; 1677 } // if 1678 } // while 1679 if (status != 0) { 1680 start = pr->u.p.lb; 1681 incr = pr->u.p.st; 1682 if (p_st != NULL) 1683 *p_st = incr; 1684 *p_lb = start + init * incr; 1685 *p_ub = start + limit * incr; 1686 if (pr->flags.ordered) { 1687 pr->u.p.ordered_lower = init; 1688 pr->u.p.ordered_upper = limit; 1689 } // if 1690 } else { 1691 *p_lb = 0; 1692 *p_ub = 0; 1693 if (p_st != NULL) 1694 *p_st = 0; 1695 } // if 1696 } // case 1697 break; 1698 1699 case 
kmp_sch_guided_simd: { 1700 // same as iterative but curr-chunk adjusted to be multiple of given 1701 // chunk 1702 T chunk = pr->u.p.parm1; 1703 KD_TRACE(100, 1704 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1705 gtid)); 1706 trip = pr->u.p.tc; 1707 // Start atomic part of calculations 1708 while (1) { 1709 ST remaining; // signed, because can be < 0 1710 init = sh->u.s.iteration; // shared value 1711 remaining = trip - init; 1712 if (remaining <= 0) { // AC: need to compare with 0 first 1713 status = 0; // nothing to do, don't try atomic op 1714 break; 1715 } 1716 KMP_DEBUG_ASSERT(init % chunk == 0); 1717 // compare with K*nproc*(chunk+1), K=2 by default 1718 if ((T)remaining < pr->u.p.parm2) { 1719 // use dynamic-style schedule 1720 // atomically increment iterations, get old value 1721 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1722 (ST)chunk); 1723 remaining = trip - init; 1724 if (remaining <= 0) { 1725 status = 0; // all iterations got by other threads 1726 } else { 1727 // got some iterations to work on 1728 status = 1; 1729 if ((T)remaining > chunk) { 1730 limit = init + chunk - 1; 1731 } else { 1732 last = true; // the last chunk 1733 limit = init + remaining - 1; 1734 } // if 1735 } // if 1736 break; 1737 } // if 1738 // divide by K*nproc 1739 UT span; 1740 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), 1741 &span); 1742 UT rem = span % chunk; 1743 if (rem) // adjust so that span%chunk == 0 1744 span += chunk - rem; 1745 limit = init + span; 1746 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1747 (ST)init, (ST)limit)) { 1748 // CAS was successful, chunk obtained 1749 status = 1; 1750 --limit; 1751 break; 1752 } // if 1753 } // while 1754 if (status != 0) { 1755 start = pr->u.p.lb; 1756 incr = pr->u.p.st; 1757 if (p_st != NULL) 1758 *p_st = incr; 1759 *p_lb = start + init * incr; 1760 *p_ub = start + limit * incr; 1761 if (pr->flags.ordered) { 1762 pr->u.p.ordered_lower = init; 1763 pr->u.p.ordered_upper = limit; 1764 } // if 1765 } else { 1766 *p_lb = 0; 1767 *p_ub = 0; 1768 if (p_st != NULL) 1769 *p_st = 0; 1770 } // if 1771 } // case 1772 break; 1773 1774 case kmp_sch_guided_analytical_chunked: { 1775 T chunkspec = pr->u.p.parm1; 1776 UT chunkIdx; 1777 #if KMP_USE_X87CONTROL 1778 /* for storing original FPCW value for Windows* OS on 1779 IA-32 architecture 8-byte version */ 1780 unsigned int oldFpcw; 1781 unsigned int fpcwSet = 0; 1782 #endif 1783 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1784 "kmp_sch_guided_analytical_chunked case\n", 1785 gtid)); 1786 1787 trip = pr->u.p.tc; 1788 1789 KMP_DEBUG_ASSERT(nproc > 1); 1790 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1791 1792 while (1) { /* this while loop is a safeguard against unexpected zero 1793 chunk sizes */ 1794 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1795 if (chunkIdx >= (UT)pr->u.p.parm2) { 1796 --trip; 1797 /* use dynamic-style scheduling */ 1798 init = chunkIdx * chunkspec + pr->u.p.count; 1799 /* need to verify init > 0 in case of overflow in the above 1800 * calculation */ 1801 if ((status = (init > 0 && init <= trip)) != 0) { 1802 limit = init + chunkspec - 1; 1803 1804 if ((last = (limit >= trip)) != 0) 1805 limit = trip; 1806 } 1807 break; 1808 } else { 1809 /* use exponential-style scheduling */ 1810 /* The following check is to workaround the lack of long double precision on 1811 Windows* OS. 
1812 This check works around the possible effect that init != 0 for chunkIdx == 0. 1813 */ 1814 #if KMP_USE_X87CONTROL 1815 /* If we haven't already done so, save original 1816 FPCW and set precision to 64-bit, as Windows* OS 1817 on IA-32 architecture defaults to 53-bit */ 1818 if (!fpcwSet) { 1819 oldFpcw = _control87(0, 0); 1820 _control87(_PC_64, _MCW_PC); 1821 fpcwSet = 0x30000; 1822 } 1823 #endif 1824 if (chunkIdx) { 1825 init = __kmp_dispatch_guided_remaining<T>( 1826 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1827 KMP_DEBUG_ASSERT(init); 1828 init = trip - init; 1829 } else 1830 init = 0; 1831 limit = trip - __kmp_dispatch_guided_remaining<T>( 1832 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1833 KMP_ASSERT(init <= limit); 1834 if (init < limit) { 1835 KMP_DEBUG_ASSERT(limit <= trip); 1836 --limit; 1837 status = 1; 1838 break; 1839 } // if 1840 } // if 1841 } // while (1) 1842 #if KMP_USE_X87CONTROL 1843 /* restore FPCW if necessary 1844 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1845 */ 1846 if (fpcwSet && (oldFpcw & fpcwSet)) 1847 _control87(oldFpcw, _MCW_PC); 1848 #endif 1849 if (status != 0) { 1850 start = pr->u.p.lb; 1851 incr = pr->u.p.st; 1852 if (p_st != NULL) 1853 *p_st = incr; 1854 *p_lb = start + init * incr; 1855 *p_ub = start + limit * incr; 1856 if (pr->flags.ordered) { 1857 pr->u.p.ordered_lower = init; 1858 pr->u.p.ordered_upper = limit; 1859 } 1860 } else { 1861 *p_lb = 0; 1862 *p_ub = 0; 1863 if (p_st != NULL) 1864 *p_st = 0; 1865 } 1866 } // case 1867 break; 1868 1869 case kmp_sch_trapezoidal: { 1870 UT index; 1871 T parm2 = pr->u.p.parm2; 1872 T parm3 = pr->u.p.parm3; 1873 T parm4 = pr->u.p.parm4; 1874 KD_TRACE(100, 1875 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1876 gtid)); 1877 1878 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1879 1880 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1881 trip = pr->u.p.tc - 1; 1882 1883 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1884 *p_lb = 0; 1885 *p_ub = 0; 1886 if (p_st != NULL) 1887 *p_st = 0; 1888 } else { 1889 start = pr->u.p.lb; 1890 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1891 incr = pr->u.p.st; 1892 1893 if ((last = (limit >= trip)) != 0) 1894 limit = trip; 1895 1896 if (p_st != NULL) 1897 *p_st = incr; 1898 1899 if (incr == 1) { 1900 *p_lb = start + init; 1901 *p_ub = start + limit; 1902 } else { 1903 *p_lb = start + init * incr; 1904 *p_ub = start + limit * incr; 1905 } 1906 1907 if (pr->flags.ordered) { 1908 pr->u.p.ordered_lower = init; 1909 pr->u.p.ordered_upper = limit; 1910 } // if 1911 } // if 1912 } // case 1913 break; 1914 default: { 1915 status = 0; // to avoid complaints on uninitialized variable use 1916 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1917 KMP_HNT(GetNewerLibrary), // Hint 1918 __kmp_msg_null // Variadic argument list terminator 1919 ); 1920 } break; 1921 } // switch 1922 if (p_last) 1923 *p_last = last; 1924 #ifdef KMP_DEBUG 1925 if (pr->flags.ordered) { 1926 char *buff; 1927 // create format specifiers before the debug output 1928 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1929 "ordered_lower:%%%s ordered_upper:%%%s\n", 1930 traits_t<UT>::spec, traits_t<UT>::spec); 1931 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1932 __kmp_str_free(&buff); 1933 } 1934 { 1935 char *buff; 1936 // create format specifiers before the debug output 1937 buff = __kmp_str_format( 1938 "__kmp_dispatch_next_algorithm: T#%%d 
exit status:%%d p_last:%%d " 1939 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1940 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1941 KMP_DEBUG_ASSERT(p_last); 1942 KMP_DEBUG_ASSERT(p_st); 1943 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1944 __kmp_str_free(&buff); 1945 } 1946 #endif 1947 return status; 1948 } 1949 1950 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1951 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1952 is not called. */ 1953 #if OMPT_SUPPORT && OMPT_OPTIONAL 1954 #define OMPT_LOOP_END \ 1955 if (status == 0) { \ 1956 if (ompt_enabled.ompt_callback_work) { \ 1957 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1958 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1959 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1960 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1961 &(task_info->task_data), 0, codeptr); \ 1962 } \ 1963 } 1964 // TODO: implement count 1965 #else 1966 #define OMPT_LOOP_END // no-op 1967 #endif 1968 1969 #if KMP_STATS_ENABLED 1970 #define KMP_STATS_LOOP_END \ 1971 { \ 1972 kmp_int64 u, l, t, i; \ 1973 l = (kmp_int64)(*p_lb); \ 1974 u = (kmp_int64)(*p_ub); \ 1975 i = (kmp_int64)(pr->u.p.st); \ 1976 if (status == 0) { \ 1977 t = 0; \ 1978 KMP_POP_PARTITIONED_TIMER(); \ 1979 } else if (i == 1) { \ 1980 if (u >= l) \ 1981 t = u - l + 1; \ 1982 else \ 1983 t = 0; \ 1984 } else if (i < 0) { \ 1985 if (l >= u) \ 1986 t = (l - u) / (-i) + 1; \ 1987 else \ 1988 t = 0; \ 1989 } else { \ 1990 if (u >= l) \ 1991 t = (u - l) / i + 1; \ 1992 else \ 1993 t = 0; \ 1994 } \ 1995 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1996 } 1997 #else 1998 #define KMP_STATS_LOOP_END /* Nothing */ 1999 #endif 2000 2001 template <typename T> 2002 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 2003 T *p_lb, T *p_ub, 2004 typename traits_t<T>::signed_t *p_st 2005 #if OMPT_SUPPORT && OMPT_OPTIONAL 2006 , 2007 void *codeptr 2008 #endif 2009 ) { 2010 2011 typedef typename traits_t<T>::unsigned_t UT; 2012 typedef typename traits_t<T>::signed_t ST; 2013 // This is potentially slightly misleading, schedule(runtime) will appear here 2014 // even if the actual runtime schedule is static. (Which points out a 2015 // disadvantage of schedule(runtime): even when static scheduling is used it 2016 // costs more than a compile time choice to use static scheduling would.) 
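// Added commentary (not from the original source): the simplest dispatch
// arithmetic used below, and in the kmp_sch_dynamic_chunked case of
// __kmp_dispatch_next_algorithm above, hands out chunk k of the normalized
// iteration space 0..tc-1 as
//   init  = chunk_size * k
//   limit = init + chunk_size - 1   (clipped to tc - 1 for the last chunk)
// For example, with chunk_size = 4 and tc = 10 the successive chunks are
// [0,3], [4,7] and [8,9] (last); the next request finds init > tc - 1, so
// status 0 is returned and the output bounds are zeroed.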
2017 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 2018 2019 int status; 2020 dispatch_private_info_template<T> *pr; 2021 __kmp_assert_valid_gtid(gtid); 2022 kmp_info_t *th = __kmp_threads[gtid]; 2023 kmp_team_t *team = th->th.th_team; 2024 2025 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 2026 KD_TRACE( 2027 1000, 2028 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 2029 gtid, p_lb, p_ub, p_st, p_last)); 2030 2031 if (team->t.t_serialized) { 2032 /* NOTE: serialize this dispatch because we are not at the active level */ 2033 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2034 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 2035 KMP_DEBUG_ASSERT(pr); 2036 2037 if ((status = (pr->u.p.tc != 0)) == 0) { 2038 *p_lb = 0; 2039 *p_ub = 0; 2040 // if ( p_last != NULL ) 2041 // *p_last = 0; 2042 if (p_st != NULL) 2043 *p_st = 0; 2044 if (__kmp_env_consistency_check) { 2045 if (pr->pushed_ws != ct_none) { 2046 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2047 } 2048 } 2049 } else if (pr->flags.nomerge) { 2050 kmp_int32 last; 2051 T start; 2052 UT limit, trip, init; 2053 ST incr; 2054 T chunk = pr->u.p.parm1; 2055 2056 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 2057 gtid)); 2058 2059 init = chunk * pr->u.p.count++; 2060 trip = pr->u.p.tc - 1; 2061 2062 if ((status = (init <= trip)) == 0) { 2063 *p_lb = 0; 2064 *p_ub = 0; 2065 // if ( p_last != NULL ) 2066 // *p_last = 0; 2067 if (p_st != NULL) 2068 *p_st = 0; 2069 if (__kmp_env_consistency_check) { 2070 if (pr->pushed_ws != ct_none) { 2071 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2072 } 2073 } 2074 } else { 2075 start = pr->u.p.lb; 2076 limit = chunk + init - 1; 2077 incr = pr->u.p.st; 2078 2079 if ((last = (limit >= trip)) != 0) { 2080 limit = trip; 2081 #if KMP_OS_WINDOWS 2082 pr->u.p.last_upper = pr->u.p.ub; 2083 #endif /* KMP_OS_WINDOWS */ 2084 } 2085 if (p_last != NULL) 2086 *p_last = last; 2087 if (p_st != NULL) 2088 *p_st = incr; 2089 if (incr == 1) { 2090 *p_lb = start + init; 2091 *p_ub = start + limit; 2092 } else { 2093 *p_lb = start + init * incr; 2094 *p_ub = start + limit * incr; 2095 } 2096 2097 if (pr->flags.ordered) { 2098 pr->u.p.ordered_lower = init; 2099 pr->u.p.ordered_upper = limit; 2100 #ifdef KMP_DEBUG 2101 { 2102 char *buff; 2103 // create format specifiers before the debug output 2104 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2105 "ordered_lower:%%%s ordered_upper:%%%s\n", 2106 traits_t<UT>::spec, traits_t<UT>::spec); 2107 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2108 pr->u.p.ordered_upper)); 2109 __kmp_str_free(&buff); 2110 } 2111 #endif 2112 } // if 2113 } // if 2114 } else { 2115 pr->u.p.tc = 0; 2116 *p_lb = pr->u.p.lb; 2117 *p_ub = pr->u.p.ub; 2118 #if KMP_OS_WINDOWS 2119 pr->u.p.last_upper = *p_ub; 2120 #endif /* KMP_OS_WINDOWS */ 2121 if (p_last != NULL) 2122 *p_last = TRUE; 2123 if (p_st != NULL) 2124 *p_st = pr->u.p.st; 2125 } // if 2126 #ifdef KMP_DEBUG 2127 { 2128 char *buff; 2129 // create format specifiers before the debug output 2130 buff = __kmp_str_format( 2131 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2132 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2133 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2134 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, 2135 (p_last ? 
*p_last : 0), status)); 2136 __kmp_str_free(&buff); 2137 } 2138 #endif 2139 #if INCLUDE_SSC_MARKS 2140 SSC_MARK_DISPATCH_NEXT(); 2141 #endif 2142 OMPT_LOOP_END; 2143 KMP_STATS_LOOP_END; 2144 return status; 2145 } else { 2146 kmp_int32 last = 0; 2147 dispatch_shared_info_template<T> volatile *sh; 2148 2149 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2150 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2151 2152 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2153 th->th.th_dispatch->th_dispatch_pr_current); 2154 KMP_DEBUG_ASSERT(pr); 2155 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2156 th->th.th_dispatch->th_dispatch_sh_current); 2157 KMP_DEBUG_ASSERT(sh); 2158 2159 #if KMP_USE_HIER_SCHED 2160 if (pr->flags.use_hier) 2161 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2162 else 2163 #endif // KMP_USE_HIER_SCHED 2164 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2165 p_st, th->th.th_team_nproc, 2166 th->th.th_info.ds.ds_tid); 2167 // status == 0: no more iterations to execute 2168 if (status == 0) { 2169 ST num_done; 2170 num_done = test_then_inc<ST>(&sh->u.s.num_done); 2171 #ifdef KMP_DEBUG 2172 { 2173 char *buff; 2174 // create format specifiers before the debug output 2175 buff = __kmp_str_format( 2176 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2177 traits_t<ST>::spec); 2178 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2179 __kmp_str_free(&buff); 2180 } 2181 #endif 2182 2183 #if KMP_USE_HIER_SCHED 2184 pr->flags.use_hier = FALSE; 2185 #endif 2186 if (num_done == th->th.th_team_nproc - 1) { 2187 #if KMP_STATIC_STEAL_ENABLED 2188 if (pr->schedule == kmp_sch_static_steal) { 2189 int i; 2190 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2191 __kmp_dispatch_num_buffers; // current loop index 2192 // loop complete, safe to destroy locks used for stealing 2193 for (i = 0; i < th->th.th_team_nproc; ++i) { 2194 dispatch_private_info_template<T> *buf = 2195 reinterpret_cast<dispatch_private_info_template<T> *>( 2196 &team->t.t_dispatch[i].th_disp_buffer[idx]); 2197 KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive 2198 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED); 2199 if (traits_t<T>::type_size > 4) { 2200 // destroy locks used for stealing 2201 kmp_lock_t *lck = buf->u.p.steal_lock; 2202 KMP_ASSERT(lck != NULL); 2203 __kmp_destroy_lock(lck); 2204 __kmp_free(lck); 2205 buf->u.p.steal_lock = NULL; 2206 } 2207 } 2208 } 2209 #endif 2210 /* NOTE: release shared buffer to be reused */ 2211 2212 KMP_MB(); /* Flush all pending memory write invalidates. */ 2213 2214 sh->u.s.num_done = 0; 2215 sh->u.s.iteration = 0; 2216 2217 /* TODO replace with general release procedure? */ 2218 if (pr->flags.ordered) { 2219 sh->u.s.ordered_iteration = 0; 2220 } 2221 2222 sh->buffer_index += __kmp_dispatch_num_buffers; 2223 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2224 gtid, sh->buffer_index)); 2225 2226 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2227 2228 } // if 2229 if (__kmp_env_consistency_check) { 2230 if (pr->pushed_ws != ct_none) { 2231 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2232 } 2233 } 2234 2235 th->th.th_dispatch->th_deo_fcn = NULL; 2236 th->th.th_dispatch->th_dxo_fcn = NULL; 2237 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2238 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2239 } // if (status == 0) 2240 #if KMP_OS_WINDOWS 2241 else if (last) { 2242 pr->u.p.last_upper = pr->u.p.ub; 2243 } 2244 #endif /* KMP_OS_WINDOWS */ 2245 if (p_last != NULL && status != 0) 2246 *p_last = last; 2247 } // if 2248 2249 #ifdef KMP_DEBUG 2250 { 2251 char *buff; 2252 // create format specifiers before the debug output 2253 buff = __kmp_str_format( 2254 "__kmp_dispatch_next: T#%%d normal case: " 2255 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2256 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2257 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2258 (p_last ? *p_last : 0), status)); 2259 __kmp_str_free(&buff); 2260 } 2261 #endif 2262 #if INCLUDE_SSC_MARKS 2263 SSC_MARK_DISPATCH_NEXT(); 2264 #endif 2265 OMPT_LOOP_END; 2266 KMP_STATS_LOOP_END; 2267 return status; 2268 } 2269 2270 template <typename T> 2271 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2272 kmp_int32 *plastiter, T *plower, T *pupper, 2273 typename traits_t<T>::signed_t incr) { 2274 typedef typename traits_t<T>::unsigned_t UT; 2275 kmp_uint32 team_id; 2276 kmp_uint32 nteams; 2277 UT trip_count; 2278 kmp_team_t *team; 2279 kmp_info_t *th; 2280 2281 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2282 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2283 #ifdef KMP_DEBUG 2284 typedef typename traits_t<T>::signed_t ST; 2285 { 2286 char *buff; 2287 // create format specifiers before the debug output 2288 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2289 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2290 traits_t<T>::spec, traits_t<T>::spec, 2291 traits_t<ST>::spec, traits_t<T>::spec); 2292 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2293 __kmp_str_free(&buff); 2294 } 2295 #endif 2296 2297 if (__kmp_env_consistency_check) { 2298 if (incr == 0) { 2299 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2300 loc); 2301 } 2302 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2303 // The loop is illegal. 
2304 // Some zero-trip loops maintained by compiler, e.g.: 2305 // for(i=10;i<0;++i) // lower >= upper - run-time check 2306 // for(i=0;i>10;--i) // lower <= upper - run-time check 2307 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2308 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2309 // Compiler does not check the following illegal loops: 2310 // for(i=0;i<10;i+=incr) // where incr<0 2311 // for(i=10;i>0;i-=incr) // where incr<0 2312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2313 } 2314 } 2315 __kmp_assert_valid_gtid(gtid); 2316 th = __kmp_threads[gtid]; 2317 team = th->th.th_team; 2318 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2319 nteams = th->th.th_teams_size.nteams; 2320 team_id = team->t.t_master_tid; 2321 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2322 2323 // compute global trip count 2324 if (incr == 1) { 2325 trip_count = *pupper - *plower + 1; 2326 } else if (incr == -1) { 2327 trip_count = *plower - *pupper + 1; 2328 } else if (incr > 0) { 2329 // upper-lower can exceed the limit of signed type 2330 trip_count = (UT)(*pupper - *plower) / incr + 1; 2331 } else { 2332 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2333 } 2334 2335 if (trip_count <= nteams) { 2336 KMP_DEBUG_ASSERT( 2337 __kmp_static == kmp_sch_static_greedy || 2338 __kmp_static == 2339 kmp_sch_static_balanced); // Unknown static scheduling type. 2340 // only some teams get single iteration, others get nothing 2341 if (team_id < trip_count) { 2342 *pupper = *plower = *plower + team_id * incr; 2343 } else { 2344 *plower = *pupper + incr; // zero-trip loop 2345 } 2346 if (plastiter != NULL) 2347 *plastiter = (team_id == trip_count - 1); 2348 } else { 2349 if (__kmp_static == kmp_sch_static_balanced) { 2350 UT chunk = trip_count / nteams; 2351 UT extras = trip_count % nteams; 2352 *plower += 2353 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2354 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2355 if (plastiter != NULL) 2356 *plastiter = (team_id == nteams - 1); 2357 } else { 2358 T chunk_inc_count = 2359 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2360 T upper = *pupper; 2361 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2362 // Unknown static scheduling type. 2363 *plower += team_id * chunk_inc_count; 2364 *pupper = *plower + chunk_inc_count - incr; 2365 // Check/correct bounds if needed 2366 if (incr > 0) { 2367 if (*pupper < *plower) 2368 *pupper = traits_t<T>::max_value; 2369 if (plastiter != NULL) 2370 *plastiter = *plower <= upper && *pupper > upper - incr; 2371 if (*pupper > upper) 2372 *pupper = upper; // tracker C73258 2373 } else { 2374 if (*pupper > *plower) 2375 *pupper = traits_t<T>::min_value; 2376 if (plastiter != NULL) 2377 *plastiter = *plower >= upper && *pupper < upper - incr; 2378 if (*pupper < upper) 2379 *pupper = upper; // tracker C73258 2380 } 2381 } 2382 } 2383 } 2384 2385 //----------------------------------------------------------------------------- 2386 // Dispatch routines 2387 // Transfer call to template< type T > 2388 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2389 // T lb, T ub, ST st, ST chunk ) 2390 extern "C" { 2391 2392 /*! 
2393 @ingroup WORK_SHARING 2394 @{ 2395 @param loc Source location 2396 @param gtid Global thread id 2397 @param schedule Schedule type 2398 @param lb Lower bound 2399 @param ub Upper bound 2400 @param st Step (or increment if you prefer) 2401 @param chunk The chunk size to block with 2402 2403 This function prepares the runtime to start a dynamically scheduled for loop, 2404 saving the loop arguments. 2405 These functions are all identical apart from the types of the arguments. 2406 */ 2407 2408 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2409 enum sched_type schedule, kmp_int32 lb, 2410 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2411 KMP_DEBUG_ASSERT(__kmp_init_serial); 2412 #if OMPT_SUPPORT && OMPT_OPTIONAL 2413 OMPT_STORE_RETURN_ADDRESS(gtid); 2414 #endif 2415 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2416 } 2417 /*! 2418 See @ref __kmpc_dispatch_init_4 2419 */ 2420 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2421 enum sched_type schedule, kmp_uint32 lb, 2422 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2423 KMP_DEBUG_ASSERT(__kmp_init_serial); 2424 #if OMPT_SUPPORT && OMPT_OPTIONAL 2425 OMPT_STORE_RETURN_ADDRESS(gtid); 2426 #endif 2427 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2428 } 2429 2430 /*! 2431 See @ref __kmpc_dispatch_init_4 2432 */ 2433 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2434 enum sched_type schedule, kmp_int64 lb, 2435 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2436 KMP_DEBUG_ASSERT(__kmp_init_serial); 2437 #if OMPT_SUPPORT && OMPT_OPTIONAL 2438 OMPT_STORE_RETURN_ADDRESS(gtid); 2439 #endif 2440 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2441 } 2442 2443 /*! 2444 See @ref __kmpc_dispatch_init_4 2445 */ 2446 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2447 enum sched_type schedule, kmp_uint64 lb, 2448 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2449 KMP_DEBUG_ASSERT(__kmp_init_serial); 2450 #if OMPT_SUPPORT && OMPT_OPTIONAL 2451 OMPT_STORE_RETURN_ADDRESS(gtid); 2452 #endif 2453 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2454 } 2455 2456 /*! 2457 See @ref __kmpc_dispatch_init_4 2458 2459 These functions differ from the __kmpc_dispatch_init set in that they are called 2460 for the composite distribute parallel for construct. Thus, before dispatching 2461 the regular iterations, the per-team iteration space must be computed first. 2462 2463 These functions are all identical apart from the types of the arguments.
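
Illustrative example (added commentary): assuming __kmp_dist_get_bounds uses
the balanced static split (kmp_sch_static_balanced), a loop over i = 0..9 with
unit stride distributed across 4 teams has trip_count = 10, chunk = 2 and
extras = 2, so teams 0..3 receive the sub-ranges [0,2], [3,5], [6,7] and
[8,9]; each team then dispatches its own sub-range with the schedule
requested here.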
2464 */ 2465 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2466 enum sched_type schedule, kmp_int32 *p_last, 2467 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2468 kmp_int32 chunk) { 2469 KMP_DEBUG_ASSERT(__kmp_init_serial); 2470 #if OMPT_SUPPORT && OMPT_OPTIONAL 2471 OMPT_STORE_RETURN_ADDRESS(gtid); 2472 #endif 2473 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2474 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2475 } 2476 2477 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2478 enum sched_type schedule, kmp_int32 *p_last, 2479 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2480 kmp_int32 chunk) { 2481 KMP_DEBUG_ASSERT(__kmp_init_serial); 2482 #if OMPT_SUPPORT && OMPT_OPTIONAL 2483 OMPT_STORE_RETURN_ADDRESS(gtid); 2484 #endif 2485 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2486 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2487 } 2488 2489 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2490 enum sched_type schedule, kmp_int32 *p_last, 2491 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2492 kmp_int64 chunk) { 2493 KMP_DEBUG_ASSERT(__kmp_init_serial); 2494 #if OMPT_SUPPORT && OMPT_OPTIONAL 2495 OMPT_STORE_RETURN_ADDRESS(gtid); 2496 #endif 2497 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2498 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2499 } 2500 2501 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2502 enum sched_type schedule, kmp_int32 *p_last, 2503 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2504 kmp_int64 chunk) { 2505 KMP_DEBUG_ASSERT(__kmp_init_serial); 2506 #if OMPT_SUPPORT && OMPT_OPTIONAL 2507 OMPT_STORE_RETURN_ADDRESS(gtid); 2508 #endif 2509 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2510 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2511 } 2512 2513 /*! 2514 @param loc Source code location 2515 @param gtid Global thread id 2516 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2517 otherwise 2518 @param p_lb Pointer to the lower bound for the next chunk of work 2519 @param p_ub Pointer to the upper bound for the next chunk of work 2520 @param p_st Pointer to the stride for the next chunk of work 2521 @return one if there is work to be done, zero otherwise 2522 2523 Get the next dynamically allocated chunk of work for this thread. 2524 If there is no more work, then the lb,ub and stride need not be modified. 2525 */ 2526 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2527 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2528 #if OMPT_SUPPORT && OMPT_OPTIONAL 2529 OMPT_STORE_RETURN_ADDRESS(gtid); 2530 #endif 2531 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2532 #if OMPT_SUPPORT && OMPT_OPTIONAL 2533 , 2534 OMPT_LOAD_RETURN_ADDRESS(gtid) 2535 #endif 2536 ); 2537 } 2538 2539 /*! 2540 See @ref __kmpc_dispatch_next_4 2541 */ 2542 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2543 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2544 kmp_int32 *p_st) { 2545 #if OMPT_SUPPORT && OMPT_OPTIONAL 2546 OMPT_STORE_RETURN_ADDRESS(gtid); 2547 #endif 2548 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2549 #if OMPT_SUPPORT && OMPT_OPTIONAL 2550 , 2551 OMPT_LOAD_RETURN_ADDRESS(gtid) 2552 #endif 2553 ); 2554 } 2555 2556 /*! 
2557 See @ref __kmpc_dispatch_next_4 2558 */ 2559 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2560 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2561 #if OMPT_SUPPORT && OMPT_OPTIONAL 2562 OMPT_STORE_RETURN_ADDRESS(gtid); 2563 #endif 2564 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2565 #if OMPT_SUPPORT && OMPT_OPTIONAL 2566 , 2567 OMPT_LOAD_RETURN_ADDRESS(gtid) 2568 #endif 2569 ); 2570 } 2571 2572 /*! 2573 See @ref __kmpc_dispatch_next_4 2574 */ 2575 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2576 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2577 kmp_int64 *p_st) { 2578 #if OMPT_SUPPORT && OMPT_OPTIONAL 2579 OMPT_STORE_RETURN_ADDRESS(gtid); 2580 #endif 2581 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2582 #if OMPT_SUPPORT && OMPT_OPTIONAL 2583 , 2584 OMPT_LOAD_RETURN_ADDRESS(gtid) 2585 #endif 2586 ); 2587 } 2588 2589 /*! 2590 @param loc Source code location 2591 @param gtid Global thread id 2592 2593 Mark the end of a dynamic loop. 2594 */ 2595 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2596 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2597 } 2598 2599 /*! 2600 See @ref __kmpc_dispatch_fini_4 2601 */ 2602 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2603 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2604 } 2605 2606 /*! 2607 See @ref __kmpc_dispatch_fini_4 2608 */ 2609 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2610 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2611 } 2612 2613 /*! 2614 See @ref __kmpc_dispatch_fini_4 2615 */ 2616 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2617 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2618 } 2619 /*! @} */ 2620 2621 //----------------------------------------------------------------------------- 2622 // Non-template routines from kmp_dispatch.cpp used in other sources 2623 2624 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2625 return value == checker; 2626 } 2627 2628 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2629 return value != checker; 2630 } 2631 2632 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2633 return value < checker; 2634 } 2635 2636 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2637 return value >= checker; 2638 } 2639 2640 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2641 return value <= checker; 2642 } 2643 2644 kmp_uint32 2645 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2646 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2647 void *obj // Higher-level synchronization object, or NULL. 2648 ) { 2649 // note: we may not belong to a team at this point 2650 volatile kmp_uint32 *spin = spinner; 2651 kmp_uint32 check = checker; 2652 kmp_uint32 spins; 2653 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2654 kmp_uint32 r; 2655 2656 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2657 KMP_INIT_YIELD(spins); 2658 // main wait spin loop 2659 while (!f(r = TCR_4(*spin), check)) { 2660 KMP_FSYNC_SPIN_PREPARE(obj); 2661 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2662 split. 
It causes problems with infinite recursion because of exit lock */ 2663 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2664 __kmp_abort_thread(); */ 2665 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2666 } 2667 KMP_FSYNC_SPIN_ACQUIRED(obj); 2668 return r; 2669 } 2670 2671 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, 2672 kmp_uint32 (*pred)(void *, kmp_uint32), 2673 void *obj // Higher-level synchronization object, or NULL. 2674 ) { 2675 // note: we may not belong to a team at this point 2676 void *spin = spinner; 2677 kmp_uint32 check = checker; 2678 kmp_uint32 spins; 2679 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2680 2681 KMP_FSYNC_SPIN_INIT(obj, spin); 2682 KMP_INIT_YIELD(spins); 2683 // main wait spin loop 2684 while (!f(spin, check)) { 2685 KMP_FSYNC_SPIN_PREPARE(obj); 2686 /* if we have waited a bit, or are oversubscribed, yield */ 2687 /* pause is in the following code */ 2688 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2689 } 2690 KMP_FSYNC_SPIN_ACQUIRED(obj); 2691 } 2692 2693 } // extern "C" 2694 2695 #ifdef KMP_GOMP_COMPAT 2696 2697 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2698 enum sched_type schedule, kmp_int32 lb, 2699 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2700 int push_ws) { 2701 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2702 push_ws); 2703 } 2704 2705 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2706 enum sched_type schedule, kmp_uint32 lb, 2707 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2708 int push_ws) { 2709 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2710 push_ws); 2711 } 2712 2713 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2714 enum sched_type schedule, kmp_int64 lb, 2715 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2716 int push_ws) { 2717 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2718 push_ws); 2719 } 2720 2721 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2722 enum sched_type schedule, kmp_uint64 lb, 2723 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2724 int push_ws) { 2725 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2726 push_ws); 2727 } 2728 2729 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2730 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2731 } 2732 2733 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2734 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2735 } 2736 2737 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2738 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2739 } 2740 2741 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2742 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2743 } 2744 2745 #endif /* KMP_GOMP_COMPAT */ 2746 2747 /* ------------------------------------------------------------------------ */ 2748
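
// Usage sketch (added illustration, not part of the runtime): the calling
// pattern a compiler typically emits for a dynamically scheduled loop such as
//   #pragma omp for schedule(dynamic, chunk)
// in terms of the entry points defined above. The wrapper function and its
// name are assumptions for illustration only; an ordered loop would
// additionally involve __kmpc_dispatch_fini_4, which is omitted here.
#if 0
static void example_dynamic_loop(ident_t *loc, kmp_int32 lb, kmp_int32 ub,
                                 kmp_int32 chunk) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  kmp_int32 p_last = 0, p_lb = 0, p_ub = 0, p_st = 0;
  // Describe the full iteration space once per thread in the team ...
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, /*st=*/1,
                         chunk);
  // ... then repeatedly request chunks until the runtime reports no more work.
  while (__kmpc_dispatch_next_4(loc, gtid, &p_last, &p_lb, &p_ub, &p_st)) {
    for (kmp_int32 i = p_lb; i <= p_ub; i += p_st) {
      ; // loop body for iteration i
    }
  }
}
#endif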