1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 /* Dynamic scheduling initialization and dispatch. 14 * 15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 16 * it may change values between parallel regions. __kmp_max_nth 17 * is the largest value __kmp_nth may take, 1 is the smallest. 18 */ 19 20 #include "kmp.h" 21 #include "kmp_error.h" 22 #include "kmp_i18n.h" 23 #include "kmp_itt.h" 24 #include "kmp_stats.h" 25 #include "kmp_str.h" 26 #if KMP_USE_X87CONTROL 27 #include <float.h> 28 #endif 29 #include "kmp_lock.h" 30 #include "kmp_dispatch.h" 31 #if KMP_USE_HIER_SCHED 32 #include "kmp_dispatch_hier.h" 33 #endif 34 35 #if OMPT_SUPPORT 36 #include "ompt-specific.h" 37 #endif 38 39 /* ------------------------------------------------------------------------ */ 40 /* ------------------------------------------------------------------------ */ 41 42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 43 kmp_info_t *th; 44 45 KMP_DEBUG_ASSERT(gtid_ref); 46 47 if (__kmp_env_consistency_check) { 48 th = __kmp_threads[*gtid_ref]; 49 if (th->th.th_root->r.r_active && 50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 51 #if KMP_USE_DYNAMIC_LOCK 52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 53 #else 54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 55 #endif 56 } 57 } 58 } 59 60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 61 kmp_info_t *th; 62 63 if (__kmp_env_consistency_check) { 64 th = __kmp_threads[*gtid_ref]; 65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 67 } 68 } 69 } 70 71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC 72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, 73 bool use_hier = false) { 74 // Pick up the nonmonotonic/monotonic bits from the scheduling type 75 // Nonmonotonic as default for dynamic schedule when no modifier is specified 76 int monotonicity = SCHEDULE_NONMONOTONIC; 77 78 // Let default be monotonic for executables 79 // compiled with OpenMP* 4.5 or less compilers 80 if (loc != NULL && loc->get_openmp_version() < 50) 81 monotonicity = SCHEDULE_MONOTONIC; 82 83 if (use_hier || __kmp_force_monotonic) 84 monotonicity = SCHEDULE_MONOTONIC; 85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 86 monotonicity = SCHEDULE_NONMONOTONIC; 87 else if (SCHEDULE_HAS_MONOTONIC(schedule)) 88 monotonicity = SCHEDULE_MONOTONIC; 89 90 return monotonicity; 91 } 92 93 #if KMP_STATIC_STEAL_ENABLED 94 enum { // values for steal_flag (possible states of private per-loop buffer) 95 UNUSED = 0, 96 CLAIMED = 1, // owner thread started initialization 97 READY = 2, // available for stealing 98 THIEF = 3 // finished by owner, or claimed by thief 99 // possible state changes: 100 // 0 -> 1 owner only, sync 101 // 0 -> 3 thief only, sync 102 // 1 -> 2 owner only, async 103 // 2 -> 3 owner only, async 104 // 3 -> 2 owner only, async 105 // 3 -> 0 last thread finishing the loop, async 106 }; 107 #endif 108 
109 // Initialize a dispatch_private_info_template<T> buffer for a particular 110 // type of schedule,chunk. The loop description is found in lb (lower bound), 111 // ub (upper bound), and st (stride). nproc is the number of threads relevant 112 // to the scheduling (often the number of threads in a team, but not always if 113 // hierarchical scheduling is used). tid is the id of the thread calling 114 // the function within the group of nproc threads. It will have a value 115 // between 0 and nproc - 1. This is often just the thread id within a team, but 116 // is not necessarily the case when using hierarchical scheduling. 117 // loc is the source file location of the corresponding loop 118 // gtid is the global thread id 119 template <typename T> 120 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 121 dispatch_private_info_template<T> *pr, 122 enum sched_type schedule, T lb, T ub, 123 typename traits_t<T>::signed_t st, 124 #if USE_ITT_BUILD 125 kmp_uint64 *cur_chunk, 126 #endif 127 typename traits_t<T>::signed_t chunk, 128 T nproc, T tid) { 129 typedef typename traits_t<T>::unsigned_t UT; 130 typedef typename traits_t<T>::floating_t DBL; 131 132 int active; 133 T tc; 134 kmp_info_t *th; 135 kmp_team_t *team; 136 int monotonicity; 137 bool use_hier; 138 139 #ifdef KMP_DEBUG 140 typedef typename traits_t<T>::signed_t ST; 141 { 142 char *buff; 143 // create format specifiers before the debug output 144 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 145 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 146 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 147 traits_t<T>::spec, traits_t<T>::spec, 148 traits_t<ST>::spec, traits_t<ST>::spec, 149 traits_t<T>::spec, traits_t<T>::spec); 150 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 151 __kmp_str_free(&buff); 152 } 153 #endif 154 /* setup data */ 155 th = __kmp_threads[gtid]; 156 team = th->th.th_team; 157 active = !team->t.t_serialized; 158 159 #if USE_ITT_BUILD 160 int itt_need_metadata_reporting = 161 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 162 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 163 team->t.t_active_level == 1; 164 #endif 165 166 #if KMP_USE_HIER_SCHED 167 use_hier = pr->flags.use_hier; 168 #else 169 use_hier = false; 170 #endif 171 172 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 173 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 174 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 175 176 /* Pick up the nomerge/ordered bits from the scheduling type */ 177 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 178 pr->flags.nomerge = TRUE; 179 schedule = 180 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 181 } else { 182 pr->flags.nomerge = FALSE; 183 } 184 pr->type_size = traits_t<T>::type_size; // remember the size of variables 185 if (kmp_ord_lower & schedule) { 186 pr->flags.ordered = TRUE; 187 schedule = 188 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 189 } else { 190 pr->flags.ordered = FALSE; 191 } 192 // Ordered overrides nonmonotonic 193 if (pr->flags.ordered) { 194 monotonicity = SCHEDULE_MONOTONIC; 195 } 196 197 if (schedule == kmp_sch_static) { 198 schedule = __kmp_static; 199 } else { 200 if (schedule == kmp_sch_runtime) { 201 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 202 // not specified) 203 schedule = team->t.t_sched.r_sched_type; 204 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 205 
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 206 if (pr->flags.ordered) // correct monotonicity for ordered loop if needed 207 monotonicity = SCHEDULE_MONOTONIC; 208 // Detail the schedule if needed (global controls are differentiated 209 // appropriately) 210 if (schedule == kmp_sch_guided_chunked) { 211 schedule = __kmp_guided; 212 } else if (schedule == kmp_sch_static) { 213 schedule = __kmp_static; 214 } 215 // Use the chunk size specified by OMP_SCHEDULE (or default if not 216 // specified) 217 chunk = team->t.t_sched.chunk; 218 #if USE_ITT_BUILD 219 if (cur_chunk) 220 *cur_chunk = chunk; 221 #endif 222 #ifdef KMP_DEBUG 223 { 224 char *buff; 225 // create format specifiers before the debug output 226 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 227 "schedule:%%d chunk:%%%s\n", 228 traits_t<ST>::spec); 229 KD_TRACE(10, (buff, gtid, schedule, chunk)); 230 __kmp_str_free(&buff); 231 } 232 #endif 233 } else { 234 if (schedule == kmp_sch_guided_chunked) { 235 schedule = __kmp_guided; 236 } 237 if (chunk <= 0) { 238 chunk = KMP_DEFAULT_CHUNK; 239 } 240 } 241 242 if (schedule == kmp_sch_auto) { 243 // mapping and differentiation: in the __kmp_do_serial_initialize() 244 schedule = __kmp_auto; 245 #ifdef KMP_DEBUG 246 { 247 char *buff; 248 // create format specifiers before the debug output 249 buff = __kmp_str_format( 250 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 251 "schedule:%%d chunk:%%%s\n", 252 traits_t<ST>::spec); 253 KD_TRACE(10, (buff, gtid, schedule, chunk)); 254 __kmp_str_free(&buff); 255 } 256 #endif 257 } 258 #if KMP_STATIC_STEAL_ENABLED 259 // map nonmonotonic:dynamic to static steal 260 if (schedule == kmp_sch_dynamic_chunked) { 261 if (monotonicity == SCHEDULE_NONMONOTONIC) 262 schedule = kmp_sch_static_steal; 263 } 264 #endif 265 /* guided analytical not safe for too many threads */ 266 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 267 schedule = kmp_sch_guided_iterative_chunked; 268 KMP_WARNING(DispatchManyThreads); 269 } 270 if (schedule == kmp_sch_runtime_simd) { 271 // compiler provides simd_width in the chunk parameter 272 schedule = team->t.t_sched.r_sched_type; 273 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 274 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 275 // Detail the schedule if needed (global controls are differentiated 276 // appropriately) 277 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 278 schedule == __kmp_static) { 279 schedule = kmp_sch_static_balanced_chunked; 280 } else { 281 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 282 schedule = kmp_sch_guided_simd; 283 } 284 chunk = team->t.t_sched.chunk * chunk; 285 } 286 #if USE_ITT_BUILD 287 if (cur_chunk) 288 *cur_chunk = chunk; 289 #endif 290 #ifdef KMP_DEBUG 291 { 292 char *buff; 293 // create format specifiers before the debug output 294 buff = __kmp_str_format( 295 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 296 " chunk:%%%s\n", 297 traits_t<ST>::spec); 298 KD_TRACE(10, (buff, gtid, schedule, chunk)); 299 __kmp_str_free(&buff); 300 } 301 #endif 302 } 303 pr->u.p.parm1 = chunk; 304 } 305 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 306 "unknown scheduling type"); 307 308 pr->u.p.count = 0; 309 310 if (__kmp_env_consistency_check) { 311 if (st == 0) { 312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 313 (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc); 314 } 315 } 316 // compute trip count 317 if (st == 1) { // most common case 318 if (ub >= lb) { 319 tc = ub - lb + 1; 320 } else { // ub < lb 321 tc = 0; // zero-trip 322 } 323 } else if (st < 0) { 324 if (lb >= ub) { 325 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 326 // where the division needs to be unsigned regardless of the result type 327 tc = (UT)(lb - ub) / (-st) + 1; 328 } else { // lb < ub 329 tc = 0; // zero-trip 330 } 331 } else { // st > 0 332 if (ub >= lb) { 333 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 334 // where the division needs to be unsigned regardless of the result type 335 tc = (UT)(ub - lb) / st + 1; 336 } else { // ub < lb 337 tc = 0; // zero-trip 338 } 339 } 340 341 #if KMP_STATS_ENABLED 342 if (KMP_MASTER_GTID(gtid)) { 343 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc); 344 } 345 #endif 346 347 pr->u.p.lb = lb; 348 pr->u.p.ub = ub; 349 pr->u.p.st = st; 350 pr->u.p.tc = tc; 351 352 #if KMP_OS_WINDOWS 353 pr->u.p.last_upper = ub + st; 354 #endif /* KMP_OS_WINDOWS */ 355 356 /* NOTE: only the active parallel region(s) has active ordered sections */ 357 358 if (active) { 359 if (pr->flags.ordered) { 360 pr->ordered_bumped = 0; 361 pr->u.p.ordered_lower = 1; 362 pr->u.p.ordered_upper = 0; 363 } 364 } 365 366 switch (schedule) { 367 #if KMP_STATIC_STEAL_ENABLED 368 case kmp_sch_static_steal: { 369 T ntc, init; 370 371 KD_TRACE(100, 372 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 373 gtid)); 374 375 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 376 if (nproc > 1 && ntc >= nproc) { 377 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 378 T id = tid; 379 T small_chunk, extras; 380 kmp_uint32 old = UNUSED; 381 int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED); 382 if (traits_t<T>::type_size > 4) { 383 // AC: TODO: check if 16-byte CAS available and use it to 384 // improve performance (probably wait for explicit request 385 // before spending time on this). 386 // For now use dynamically allocated per-private-buffer lock, 387 // free memory in __kmp_dispatch_next when status==0. 388 pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 389 __kmp_init_lock(pr->u.p.steal_lock); 390 } 391 small_chunk = ntc / nproc; 392 extras = ntc % nproc; 393 394 init = id * small_chunk + (id < extras ? id : extras); 395 pr->u.p.count = init; 396 if (claimed) { // are we succeeded in claiming own buffer? 397 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 398 // Other threads will inspect steal_flag when searching for a victim. 399 // READY means other threads may steal from this thread from now on. 400 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 401 } else { 402 // other thread has stolen whole our range 403 KMP_DEBUG_ASSERT(pr->steal_flag == THIEF); 404 pr->u.p.ub = init; // mark there is no iterations to work on 405 } 406 pr->u.p.parm2 = ntc; // save number of chunks 407 // parm3 is the number of times to attempt stealing which is 408 // nproc (just a heuristics, could be optimized later on). 
409 pr->u.p.parm3 = nproc; 410 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 411 break; 412 } else { 413 /* too few chunks: switching to kmp_sch_dynamic_chunked */ 414 schedule = kmp_sch_dynamic_chunked; 415 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " 416 "kmp_sch_dynamic_chunked\n", 417 gtid)); 418 goto dynamic_init; 419 break; 420 } // if 421 } // case 422 #endif 423 case kmp_sch_static_balanced: { 424 T init, limit; 425 426 KD_TRACE( 427 100, 428 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 429 gtid)); 430 431 if (nproc > 1) { 432 T id = tid; 433 434 if (tc < nproc) { 435 if (id < tc) { 436 init = id; 437 limit = id; 438 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 439 } else { 440 pr->u.p.count = 1; /* means no more chunks to execute */ 441 pr->u.p.parm1 = FALSE; 442 break; 443 } 444 } else { 445 T small_chunk = tc / nproc; 446 T extras = tc % nproc; 447 init = id * small_chunk + (id < extras ? id : extras); 448 limit = init + small_chunk - (id < extras ? 0 : 1); 449 pr->u.p.parm1 = (id == nproc - 1); 450 } 451 } else { 452 if (tc > 0) { 453 init = 0; 454 limit = tc - 1; 455 pr->u.p.parm1 = TRUE; 456 } else { 457 // zero trip count 458 pr->u.p.count = 1; /* means no more chunks to execute */ 459 pr->u.p.parm1 = FALSE; 460 break; 461 } 462 } 463 #if USE_ITT_BUILD 464 // Calculate chunk for metadata report 465 if (itt_need_metadata_reporting) 466 if (cur_chunk) 467 *cur_chunk = limit - init + 1; 468 #endif 469 if (st == 1) { 470 pr->u.p.lb = lb + init; 471 pr->u.p.ub = lb + limit; 472 } else { 473 // calculated upper bound, "ub" is user-defined upper bound 474 T ub_tmp = lb + limit * st; 475 pr->u.p.lb = lb + init * st; 476 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 477 // it exactly 478 if (st > 0) { 479 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 480 } else { 481 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 482 } 483 } 484 if (pr->flags.ordered) { 485 pr->u.p.ordered_lower = init; 486 pr->u.p.ordered_upper = limit; 487 } 488 break; 489 } // case 490 case kmp_sch_static_balanced_chunked: { 491 // similar to balanced, but chunk adjusted to multiple of simd width 492 T nth = nproc; 493 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 494 " -> falling-through to static_greedy\n", 495 gtid)); 496 schedule = kmp_sch_static_greedy; 497 if (nth > 1) 498 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 499 else 500 pr->u.p.parm1 = tc; 501 break; 502 } // case 503 case kmp_sch_guided_simd: 504 case kmp_sch_guided_iterative_chunked: { 505 KD_TRACE( 506 100, 507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 508 " case\n", 509 gtid)); 510 511 if (nproc > 1) { 512 if ((2L * chunk + 1) * nproc >= tc) { 513 /* chunk size too large, switch to dynamic */ 514 schedule = kmp_sch_dynamic_chunked; 515 goto dynamic_init; 516 } else { 517 // when remaining iters become less than parm2 - switch to dynamic 518 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 519 *(double *)&pr->u.p.parm3 = 520 guided_flt_param / (double)nproc; // may occupy parm3 and parm4 521 } 522 } else { 523 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 524 "kmp_sch_static_greedy\n", 525 gtid)); 526 schedule = kmp_sch_static_greedy; 527 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 528 KD_TRACE( 529 100, 530 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 531 gtid)); 532 pr->u.p.parm1 = tc; 533 } // if 534 } // case 535 break; 536 case kmp_sch_guided_analytical_chunked: { 537 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 538 "kmp_sch_guided_analytical_chunked case\n", 539 gtid)); 540 541 if (nproc > 1) { 542 if ((2L * chunk + 1) * nproc >= tc) { 543 /* chunk size too large, switch to dynamic */ 544 schedule = kmp_sch_dynamic_chunked; 545 goto dynamic_init; 546 } else { 547 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 548 DBL x; 549 550 #if KMP_USE_X87CONTROL 551 /* Linux* OS already has 64-bit computation by default for long double, 552 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 553 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 554 instead of the default 53-bit. Even though long double doesn't work 555 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 556 expected to impact the correctness of the algorithm, but this has not 557 been mathematically proven. 
*/ 558 // save original FPCW and set precision to 64-bit, as 559 // Windows* OS on IA-32 architecture defaults to 53-bit 560 unsigned int oldFpcw = _control87(0, 0); 561 _control87(_PC_64, _MCW_PC); // 0,0x30000 562 #endif 563 /* value used for comparison in solver for cross-over point */ 564 KMP_ASSERT(tc > 0); 565 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 566 567 /* crossover point--chunk indexes equal to or greater than 568 this point switch to dynamic-style scheduling */ 569 UT cross; 570 571 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 572 x = 1.0 - 0.5 / (double)nproc; 573 574 #ifdef KMP_DEBUG 575 { // test natural alignment 576 struct _test_a { 577 char a; 578 union { 579 char b; 580 DBL d; 581 }; 582 } t; 583 ptrdiff_t natural_alignment = 584 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 585 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 586 // long)natural_alignment ); 587 KMP_DEBUG_ASSERT( 588 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 589 } 590 #endif // KMP_DEBUG 591 592 /* save the term in thread private dispatch structure */ 593 *(DBL *)&pr->u.p.parm3 = x; 594 595 /* solve for the crossover point to the nearest integer i for which C_i 596 <= chunk */ 597 { 598 UT left, right, mid; 599 long double p; 600 601 /* estimate initial upper and lower bound */ 602 603 /* doesn't matter what value right is as long as it is positive, but 604 it affects performance of the solver */ 605 right = 229; 606 p = __kmp_pow<UT>(x, right); 607 if (p > target) { 608 do { 609 p *= p; 610 right <<= 1; 611 } while (p > target && right < (1 << 27)); 612 /* lower bound is previous (failed) estimate of upper bound */ 613 left = right >> 1; 614 } else { 615 left = 0; 616 } 617 618 /* bisection root-finding method */ 619 while (left + 1 < right) { 620 mid = (left + right) / 2; 621 if (__kmp_pow<UT>(x, mid) > target) { 622 left = mid; 623 } else { 624 right = mid; 625 } 626 } // while 627 cross = right; 628 } 629 /* assert sanity of computed crossover point */ 630 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 631 __kmp_pow<UT>(x, cross) <= target); 632 633 /* save the crossover point in thread private dispatch structure */ 634 pr->u.p.parm2 = cross; 635 636 // C75803 637 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 638 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 639 #else 640 #define GUIDED_ANALYTICAL_WORKAROUND (x) 641 #endif 642 /* dynamic-style scheduling offset */ 643 pr->u.p.count = tc - 644 __kmp_dispatch_guided_remaining( 645 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 646 cross * chunk; 647 #if KMP_USE_X87CONTROL 648 // restore FPCW 649 _control87(oldFpcw, _MCW_PC); 650 #endif 651 } // if 652 } else { 653 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 654 "kmp_sch_static_greedy\n", 655 gtid)); 656 schedule = kmp_sch_static_greedy; 657 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 658 pr->u.p.parm1 = tc; 659 } // if 660 } // case 661 break; 662 case kmp_sch_static_greedy: 663 KD_TRACE( 664 100, 665 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 666 gtid)); 667 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 668 break; 669 case kmp_sch_static_chunked: 670 case kmp_sch_dynamic_chunked: 671 dynamic_init: 672 if (tc == 0) 673 break; 674 if (pr->u.p.parm1 <= 0) 675 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 676 else if (pr->u.p.parm1 > tc) 677 pr->u.p.parm1 = tc; 678 // Store the total number of chunks to prevent integer overflow during 679 // bounds calculations in the get next chunk routine. 680 pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0); 681 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 682 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 683 gtid)); 684 break; 685 case kmp_sch_trapezoidal: { 686 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 687 688 T parm1, parm2, parm3, parm4; 689 KD_TRACE(100, 690 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 691 gtid)); 692 693 parm1 = chunk; 694 695 /* F : size of the first cycle */ 696 parm2 = (tc / (2 * nproc)); 697 698 if (parm2 < 1) { 699 parm2 = 1; 700 } 701 702 /* L : size of the last cycle. Make sure the last cycle is not larger 703 than the first cycle. */ 704 if (parm1 < 1) { 705 parm1 = 1; 706 } else if (parm1 > parm2) { 707 parm1 = parm2; 708 } 709 710 /* N : number of cycles */ 711 parm3 = (parm2 + parm1); 712 parm3 = (2 * tc + parm3 - 1) / parm3; 713 714 if (parm3 < 2) { 715 parm3 = 2; 716 } 717 718 /* sigma : decreasing incr of the trapezoid */ 719 parm4 = (parm3 - 1); 720 parm4 = (parm2 - parm1) / parm4; 721 722 // pointless check, because parm4 >= 0 always 723 // if ( parm4 < 0 ) { 724 // parm4 = 0; 725 //} 726 727 pr->u.p.parm1 = parm1; 728 pr->u.p.parm2 = parm2; 729 pr->u.p.parm3 = parm3; 730 pr->u.p.parm4 = parm4; 731 } // case 732 break; 733 734 default: { 735 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 736 KMP_HNT(GetNewerLibrary), // Hint 737 __kmp_msg_null // Variadic argument list terminator 738 ); 739 } break; 740 } // switch 741 pr->schedule = schedule; 742 } 743 744 #if KMP_USE_HIER_SCHED 745 template <typename T> 746 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 747 typename traits_t<T>::signed_t st); 748 template <> 749 inline void 750 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 751 kmp_int32 ub, kmp_int32 st) { 752 __kmp_dispatch_init_hierarchy<kmp_int32>( 753 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 754 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 755 } 756 template <> 757 inline void 758 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 759 kmp_uint32 ub, kmp_int32 st) { 760 __kmp_dispatch_init_hierarchy<kmp_uint32>( 761 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 762 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 763 } 764 template <> 765 inline void 766 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 767 kmp_int64 ub, kmp_int64 st) { 768 __kmp_dispatch_init_hierarchy<kmp_int64>( 769 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 770 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 771 } 772 template <> 773 inline void 774 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 775 kmp_uint64 ub, kmp_int64 st) { 776 __kmp_dispatch_init_hierarchy<kmp_uint64>( 777 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 778 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 779 } 780 781 // free all the hierarchy scheduling memory associated with the team 782 void 
__kmp_dispatch_free_hierarchies(kmp_team_t *team) { 783 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 784 for (int i = 0; i < num_disp_buff; ++i) { 785 // type does not matter here so use kmp_int32 786 auto sh = 787 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 788 &team->t.t_disp_buffer[i]); 789 if (sh->hier) { 790 sh->hier->deallocate(); 791 __kmp_free(sh->hier); 792 } 793 } 794 } 795 #endif 796 797 // UT - unsigned flavor of T, ST - signed flavor of T, 798 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 799 template <typename T> 800 static void 801 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 802 T ub, typename traits_t<T>::signed_t st, 803 typename traits_t<T>::signed_t chunk, int push_ws) { 804 typedef typename traits_t<T>::unsigned_t UT; 805 806 int active; 807 kmp_info_t *th; 808 kmp_team_t *team; 809 kmp_uint32 my_buffer_index; 810 dispatch_private_info_template<T> *pr; 811 dispatch_shared_info_template<T> volatile *sh; 812 813 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 814 sizeof(dispatch_private_info)); 815 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 816 sizeof(dispatch_shared_info)); 817 __kmp_assert_valid_gtid(gtid); 818 819 if (!TCR_4(__kmp_init_parallel)) 820 __kmp_parallel_initialize(); 821 822 __kmp_resume_if_soft_paused(); 823 824 #if INCLUDE_SSC_MARKS 825 SSC_MARK_DISPATCH_INIT(); 826 #endif 827 #ifdef KMP_DEBUG 828 typedef typename traits_t<T>::signed_t ST; 829 { 830 char *buff; 831 // create format specifiers before the debug output 832 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 833 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 834 traits_t<ST>::spec, traits_t<T>::spec, 835 traits_t<T>::spec, traits_t<ST>::spec); 836 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 837 __kmp_str_free(&buff); 838 } 839 #endif 840 /* setup data */ 841 th = __kmp_threads[gtid]; 842 team = th->th.th_team; 843 active = !team->t.t_serialized; 844 th->th.th_ident = loc; 845 846 // Any half-decent optimizer will remove this test when the blocks are empty 847 // since the macros expand to nothing 848 // when statistics are disabled. 849 if (schedule == __kmp_static) { 850 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 851 } else { 852 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 853 } 854 855 #if KMP_USE_HIER_SCHED 856 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 857 // Hierarchical scheduling does not work with ordered, so if ordered is 858 // detected, then revert back to threaded scheduling. 859 bool ordered; 860 enum sched_type my_sched = schedule; 861 my_buffer_index = th->th.th_dispatch->th_disp_index; 862 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 863 &th->th.th_dispatch 864 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 865 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 866 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 867 my_sched = 868 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 869 ordered = (kmp_ord_lower & my_sched); 870 if (pr->flags.use_hier) { 871 if (ordered) { 872 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 873 "Disabling hierarchical scheduling.\n", 874 gtid)); 875 pr->flags.use_hier = FALSE; 876 } 877 } 878 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 879 // Don't use hierarchical for ordered parallel loops and don't 880 // use the runtime hierarchy if one was specified in the program 881 if (!ordered && !pr->flags.use_hier) 882 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 883 } 884 #endif // KMP_USE_HIER_SCHED 885 886 #if USE_ITT_BUILD 887 kmp_uint64 cur_chunk = chunk; 888 int itt_need_metadata_reporting = 889 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 890 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 891 team->t.t_active_level == 1; 892 #endif 893 if (!active) { 894 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 895 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 896 } else { 897 KMP_DEBUG_ASSERT(th->th.th_dispatch == 898 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 899 900 my_buffer_index = th->th.th_dispatch->th_disp_index++; 901 902 /* What happens when number of threads changes, need to resize buffer? */ 903 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 904 &th->th.th_dispatch 905 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 906 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 907 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 908 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 909 my_buffer_index)); 910 if (sh->buffer_index != my_buffer_index) { // too many loops in progress? 911 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d" 912 " sh->buffer_index:%d\n", 913 gtid, my_buffer_index, sh->buffer_index)); 914 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 915 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 916 // Note: KMP_WAIT() cannot be used there: buffer index and 917 // my_buffer_index are *always* 32-bit integers. 
918 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 919 "sh->buffer_index:%d\n", 920 gtid, my_buffer_index, sh->buffer_index)); 921 } 922 } 923 924 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 925 #if USE_ITT_BUILD 926 &cur_chunk, 927 #endif 928 chunk, (T)th->th.th_team_nproc, 929 (T)th->th.th_info.ds.ds_tid); 930 if (active) { 931 if (pr->flags.ordered == 0) { 932 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 933 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 934 } else { 935 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 936 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 937 } 938 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 939 th->th.th_dispatch->th_dispatch_sh_current = 940 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 941 #if USE_ITT_BUILD 942 if (pr->flags.ordered) { 943 __kmp_itt_ordered_init(gtid); 944 } 945 // Report loop metadata 946 if (itt_need_metadata_reporting) { 947 // Only report metadata by primary thread of active team at level 1 948 kmp_uint64 schedtype = 0; 949 switch (schedule) { 950 case kmp_sch_static_chunked: 951 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 952 break; 953 case kmp_sch_static_greedy: 954 cur_chunk = pr->u.p.parm1; 955 break; 956 case kmp_sch_dynamic_chunked: 957 schedtype = 1; 958 break; 959 case kmp_sch_guided_iterative_chunked: 960 case kmp_sch_guided_analytical_chunked: 961 case kmp_sch_guided_simd: 962 schedtype = 2; 963 break; 964 default: 965 // Should we put this case under "static"? 966 // case kmp_sch_static_steal: 967 schedtype = 3; 968 break; 969 } 970 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 971 } 972 #if KMP_USE_HIER_SCHED 973 if (pr->flags.use_hier) { 974 pr->u.p.count = 0; 975 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 976 } 977 #endif // KMP_USER_HIER_SCHED 978 #endif /* USE_ITT_BUILD */ 979 } 980 981 #ifdef KMP_DEBUG 982 { 983 char *buff; 984 // create format specifiers before the debug output 985 buff = __kmp_str_format( 986 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 987 "lb:%%%s ub:%%%s" 988 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 989 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 990 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 991 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 992 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 993 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 994 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 995 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 996 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 997 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 998 __kmp_str_free(&buff); 999 } 1000 #endif 1001 #if OMPT_SUPPORT && OMPT_OPTIONAL 1002 if (ompt_enabled.ompt_callback_work) { 1003 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1004 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1005 ompt_callbacks.ompt_callback(ompt_callback_work)( 1006 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 1007 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 1008 } 1009 #endif 1010 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 1011 } 1012 1013 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1014 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1015 * 
every chunk of iterations. If the ordered section(s) were not executed 1016 * for this iteration (or every iteration in this chunk), we need to set the 1017 * ordered iteration counters so that the next thread can proceed. */ 1018 template <typename UT> 1019 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1020 typedef typename traits_t<UT>::signed_t ST; 1021 __kmp_assert_valid_gtid(gtid); 1022 kmp_info_t *th = __kmp_threads[gtid]; 1023 1024 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1025 if (!th->th.th_team->t.t_serialized) { 1026 1027 dispatch_private_info_template<UT> *pr = 1028 reinterpret_cast<dispatch_private_info_template<UT> *>( 1029 th->th.th_dispatch->th_dispatch_pr_current); 1030 dispatch_shared_info_template<UT> volatile *sh = 1031 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1032 th->th.th_dispatch->th_dispatch_sh_current); 1033 KMP_DEBUG_ASSERT(pr); 1034 KMP_DEBUG_ASSERT(sh); 1035 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1036 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1037 1038 if (pr->ordered_bumped) { 1039 KD_TRACE( 1040 1000, 1041 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1042 gtid)); 1043 pr->ordered_bumped = 0; 1044 } else { 1045 UT lower = pr->u.p.ordered_lower; 1046 1047 #ifdef KMP_DEBUG 1048 { 1049 char *buff; 1050 // create format specifiers before the debug output 1051 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1052 "ordered_iteration:%%%s lower:%%%s\n", 1053 traits_t<UT>::spec, traits_t<UT>::spec); 1054 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1055 __kmp_str_free(&buff); 1056 } 1057 #endif 1058 1059 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1060 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1061 KMP_MB(); /* is this necessary? 
*/ 1062 #ifdef KMP_DEBUG 1063 { 1064 char *buff; 1065 // create format specifiers before the debug output 1066 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1067 "ordered_iteration:%%%s lower:%%%s\n", 1068 traits_t<UT>::spec, traits_t<UT>::spec); 1069 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1070 __kmp_str_free(&buff); 1071 } 1072 #endif 1073 1074 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1075 } // if 1076 } // if 1077 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1078 } 1079 1080 #ifdef KMP_GOMP_COMPAT 1081 1082 template <typename UT> 1083 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1084 typedef typename traits_t<UT>::signed_t ST; 1085 __kmp_assert_valid_gtid(gtid); 1086 kmp_info_t *th = __kmp_threads[gtid]; 1087 1088 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1089 if (!th->th.th_team->t.t_serialized) { 1090 dispatch_private_info_template<UT> *pr = 1091 reinterpret_cast<dispatch_private_info_template<UT> *>( 1092 th->th.th_dispatch->th_dispatch_pr_current); 1093 dispatch_shared_info_template<UT> volatile *sh = 1094 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1095 th->th.th_dispatch->th_dispatch_sh_current); 1096 KMP_DEBUG_ASSERT(pr); 1097 KMP_DEBUG_ASSERT(sh); 1098 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1099 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1100 1101 UT lower = pr->u.p.ordered_lower; 1102 UT upper = pr->u.p.ordered_upper; 1103 UT inc = upper - lower + 1; 1104 1105 if (pr->ordered_bumped == inc) { 1106 KD_TRACE( 1107 1000, 1108 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1109 gtid)); 1110 pr->ordered_bumped = 0; 1111 } else { 1112 inc -= pr->ordered_bumped; 1113 1114 #ifdef KMP_DEBUG 1115 { 1116 char *buff; 1117 // create format specifiers before the debug output 1118 buff = __kmp_str_format( 1119 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1120 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1121 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1122 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1123 __kmp_str_free(&buff); 1124 } 1125 #endif 1126 1127 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1128 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1129 1130 KMP_MB(); /* is this necessary? */ 1131 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1132 "ordered_bumped to zero\n", 1133 gtid)); 1134 pr->ordered_bumped = 0; 1135 //!!!!! TODO check if the inc should be unsigned, or signed??? 
1136 #ifdef KMP_DEBUG 1137 { 1138 char *buff; 1139 // create format specifiers before the debug output 1140 buff = __kmp_str_format( 1141 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1142 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1143 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1144 traits_t<UT>::spec); 1145 KD_TRACE(1000, 1146 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1147 __kmp_str_free(&buff); 1148 } 1149 #endif 1150 1151 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1152 } 1153 // } 1154 } 1155 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1156 } 1157 1158 #endif /* KMP_GOMP_COMPAT */ 1159 1160 template <typename T> 1161 int __kmp_dispatch_next_algorithm(int gtid, 1162 dispatch_private_info_template<T> *pr, 1163 dispatch_shared_info_template<T> volatile *sh, 1164 kmp_int32 *p_last, T *p_lb, T *p_ub, 1165 typename traits_t<T>::signed_t *p_st, T nproc, 1166 T tid) { 1167 typedef typename traits_t<T>::unsigned_t UT; 1168 typedef typename traits_t<T>::signed_t ST; 1169 typedef typename traits_t<T>::floating_t DBL; 1170 int status = 0; 1171 bool last = false; 1172 T start; 1173 ST incr; 1174 UT limit, trip, init; 1175 kmp_info_t *th = __kmp_threads[gtid]; 1176 kmp_team_t *team = th->th.th_team; 1177 1178 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1179 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1180 KMP_DEBUG_ASSERT(pr); 1181 KMP_DEBUG_ASSERT(sh); 1182 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1183 #ifdef KMP_DEBUG 1184 { 1185 char *buff; 1186 // create format specifiers before the debug output 1187 buff = 1188 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1189 "sh:%%p nproc:%%%s tid:%%%s\n", 1190 traits_t<T>::spec, traits_t<T>::spec); 1191 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1192 __kmp_str_free(&buff); 1193 } 1194 #endif 1195 1196 // zero trip count 1197 if (pr->u.p.tc == 0) { 1198 KD_TRACE(10, 1199 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1200 "zero status:%d\n", 1201 gtid, status)); 1202 return 0; 1203 } 1204 1205 switch (pr->schedule) { 1206 #if KMP_STATIC_STEAL_ENABLED 1207 case kmp_sch_static_steal: { 1208 T chunk = pr->u.p.parm1; 1209 UT nchunks = pr->u.p.parm2; 1210 KD_TRACE(100, 1211 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1212 gtid)); 1213 1214 trip = pr->u.p.tc - 1; 1215 1216 if (traits_t<T>::type_size > 4) { 1217 // use lock for 8-byte induction variable. 
1218 // TODO (optional): check presence and use 16-byte CAS 1219 kmp_lock_t *lck = pr->u.p.steal_lock; 1220 KMP_DEBUG_ASSERT(lck != NULL); 1221 if (pr->u.p.count < (UT)pr->u.p.ub) { 1222 KMP_DEBUG_ASSERT(pr->steal_flag == READY); 1223 __kmp_acquire_lock(lck, gtid); 1224 // try to get own chunk of iterations 1225 init = (pr->u.p.count)++; 1226 status = (init < (UT)pr->u.p.ub); 1227 __kmp_release_lock(lck, gtid); 1228 } else { 1229 status = 0; // no own chunks 1230 } 1231 if (!status) { // try to steal 1232 kmp_lock_t *lckv; // victim buffer's lock 1233 T while_limit = pr->u.p.parm3; 1234 T while_index = 0; 1235 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1236 __kmp_dispatch_num_buffers; // current loop index 1237 // note: victim thread can potentially execute another loop 1238 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive 1239 while ((!status) && (while_limit != ++while_index)) { 1240 dispatch_private_info_template<T> *v; 1241 T remaining; 1242 T victimId = pr->u.p.parm4; 1243 T oldVictimId = victimId ? victimId - 1 : nproc - 1; 1244 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1245 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1246 KMP_DEBUG_ASSERT(v); 1247 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && 1248 oldVictimId != victimId) { 1249 victimId = (victimId + 1) % nproc; 1250 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1251 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1252 KMP_DEBUG_ASSERT(v); 1253 } 1254 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { 1255 continue; // try once more (nproc attempts in total) 1256 } 1257 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { 1258 kmp_uint32 old = UNUSED; 1259 // try to steal whole range from inactive victim 1260 status = v->steal_flag.compare_exchange_strong(old, THIEF); 1261 if (status) { 1262 // initialize self buffer with victim's whole range of chunks 1263 T id = victimId; 1264 T small_chunk, extras; 1265 small_chunk = nchunks / nproc; // chunks per thread 1266 extras = nchunks % nproc; 1267 init = id * small_chunk + (id < extras ? id : extras); 1268 __kmp_acquire_lock(lck, gtid); 1269 pr->u.p.count = init + 1; // exclude one we execute immediately 1270 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 1271 __kmp_release_lock(lck, gtid); 1272 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 1273 // no need to reinitialize other thread invariants: lb, st, etc. 
1274 #ifdef KMP_DEBUG 1275 { 1276 char *buff; 1277 // create format specifiers before the debug output 1278 buff = __kmp_str_format( 1279 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " 1280 "count:%%%s ub:%%%s\n", 1281 traits_t<UT>::spec, traits_t<T>::spec); 1282 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); 1283 __kmp_str_free(&buff); 1284 } 1285 #endif 1286 // activate non-empty buffer and let others steal from us 1287 if (pr->u.p.count < (UT)pr->u.p.ub) 1288 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 1289 break; 1290 } 1291 } 1292 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY || 1293 v->u.p.count >= (UT)v->u.p.ub) { 1294 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid 1295 continue; // no chunks to steal, try next victim 1296 } 1297 lckv = v->u.p.steal_lock; 1298 KMP_ASSERT(lckv != NULL); 1299 __kmp_acquire_lock(lckv, gtid); 1300 limit = v->u.p.ub; // keep initial ub 1301 if (v->u.p.count >= limit) { 1302 __kmp_release_lock(lckv, gtid); 1303 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid 1304 continue; // no chunks to steal, try next victim 1305 } 1306 1307 // stealing succeded, reduce victim's ub by 1/4 of undone chunks 1308 // TODO: is this heuristics good enough?? 1309 remaining = limit - v->u.p.count; 1310 if (remaining > 7) { 1311 // steal 1/4 of remaining 1312 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1313 init = (v->u.p.ub -= (remaining >> 2)); 1314 } else { 1315 // steal 1 chunk of 1..7 remaining 1316 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1317 init = (v->u.p.ub -= 1); 1318 } 1319 __kmp_release_lock(lckv, gtid); 1320 #ifdef KMP_DEBUG 1321 { 1322 char *buff; 1323 // create format specifiers before the debug output 1324 buff = __kmp_str_format( 1325 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " 1326 "count:%%%s ub:%%%s\n", 1327 traits_t<UT>::spec, traits_t<UT>::spec); 1328 KD_TRACE(10, (buff, gtid, victimId, init, limit)); 1329 __kmp_str_free(&buff); 1330 } 1331 #endif 1332 KMP_DEBUG_ASSERT(init + 1 <= limit); 1333 pr->u.p.parm4 = victimId; // remember victim to steal from 1334 status = 1; 1335 // now update own count and ub with stolen range excluding init chunk 1336 __kmp_acquire_lock(lck, gtid); 1337 pr->u.p.count = init + 1; 1338 pr->u.p.ub = limit; 1339 __kmp_release_lock(lck, gtid); 1340 // activate non-empty buffer and let others steal from us 1341 if (init + 1 < limit) 1342 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 1343 } // while (search for victim) 1344 } // if (try to find victim and steal) 1345 } else { 1346 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1347 // as all operations on pair (count, ub) must be done atomically 1348 typedef union { 1349 struct { 1350 UT count; 1351 T ub; 1352 } p; 1353 kmp_int64 b; 1354 } union_i4; 1355 union_i4 vold, vnew; 1356 if (pr->u.p.count < (UT)pr->u.p.ub) { 1357 KMP_DEBUG_ASSERT(pr->steal_flag == READY); 1358 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1359 vnew.b = vold.b; 1360 vnew.p.count++; // get chunk from head of self range 1361 while (!KMP_COMPARE_AND_STORE_REL64( 1362 (volatile kmp_int64 *)&pr->u.p.count, 1363 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1364 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1365 KMP_CPU_PAUSE(); 1366 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1367 vnew.b = vold.b; 1368 vnew.p.count++; 1369 } 1370 init = vold.p.count; 1371 status = (init < (UT)vold.p.ub); 1372 } else { 1373 status = 0; // no own chunks 1374 } 1375 if (!status) { // try to steal 1376 T while_limit 
= pr->u.p.parm3; 1377 T while_index = 0; 1378 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1379 __kmp_dispatch_num_buffers; // current loop index 1380 // note: victim thread can potentially execute another loop 1381 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive 1382 while ((!status) && (while_limit != ++while_index)) { 1383 dispatch_private_info_template<T> *v; 1384 T remaining; 1385 T victimId = pr->u.p.parm4; 1386 T oldVictimId = victimId ? victimId - 1 : nproc - 1; 1387 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1388 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1389 KMP_DEBUG_ASSERT(v); 1390 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && 1391 oldVictimId != victimId) { 1392 victimId = (victimId + 1) % nproc; 1393 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1394 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1395 KMP_DEBUG_ASSERT(v); 1396 } 1397 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { 1398 continue; // try once more (nproc attempts in total) 1399 } 1400 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { 1401 kmp_uint32 old = UNUSED; 1402 // try to steal whole range from inactive victim 1403 status = v->steal_flag.compare_exchange_strong(old, THIEF); 1404 if (status) { 1405 // initialize self buffer with victim's whole range of chunks 1406 T id = victimId; 1407 T small_chunk, extras; 1408 small_chunk = nchunks / nproc; // chunks per thread 1409 extras = nchunks % nproc; 1410 init = id * small_chunk + (id < extras ? id : extras); 1411 vnew.p.count = init + 1; 1412 vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0); 1413 // write pair (count, ub) at once atomically 1414 #if KMP_ARCH_X86 1415 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b); 1416 #else 1417 *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b; 1418 #endif 1419 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 1420 // no need to initialize other thread invariants: lb, st, etc. 1421 #ifdef KMP_DEBUG 1422 { 1423 char *buff; 1424 // create format specifiers before the debug output 1425 buff = __kmp_str_format( 1426 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " 1427 "count:%%%s ub:%%%s\n", 1428 traits_t<UT>::spec, traits_t<T>::spec); 1429 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); 1430 __kmp_str_free(&buff); 1431 } 1432 #endif 1433 // activate non-empty buffer and let others steal from us 1434 if (pr->u.p.count < (UT)pr->u.p.ub) 1435 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 1436 break; 1437 } 1438 } 1439 while (1) { // CAS loop with check if victim still has enough chunks 1440 // many threads may be stealing concurrently from same victim 1441 vold.b = *(volatile kmp_int64 *)(&v->u.p.count); 1442 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY || 1443 vold.p.count >= (UT)vold.p.ub) { 1444 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id 1445 break; // no chunks to steal, try next victim 1446 } 1447 vnew.b = vold.b; 1448 remaining = vold.p.ub - vold.p.count; 1449 // try to steal 1/4 of remaining 1450 // TODO: is this heuristics good enough?? 
1451 if (remaining > 7) { 1452 vnew.p.ub -= remaining >> 2; // steal from tail of victim's range 1453 } else { 1454 vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining 1455 } 1456 KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip); 1457 if (KMP_COMPARE_AND_STORE_REL64( 1458 (volatile kmp_int64 *)&v->u.p.count, 1459 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1460 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1461 // stealing succedded 1462 #ifdef KMP_DEBUG 1463 { 1464 char *buff; 1465 // create format specifiers before the debug output 1466 buff = __kmp_str_format( 1467 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " 1468 "count:%%%s ub:%%%s\n", 1469 traits_t<T>::spec, traits_t<T>::spec); 1470 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub)); 1471 __kmp_str_free(&buff); 1472 } 1473 #endif 1474 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1475 vold.p.ub - vnew.p.ub); 1476 status = 1; 1477 pr->u.p.parm4 = victimId; // keep victim id 1478 // now update own count and ub 1479 init = vnew.p.ub; 1480 vold.p.count = init + 1; 1481 #if KMP_ARCH_X86 1482 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1483 #else 1484 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1485 #endif 1486 // activate non-empty buffer and let others steal from us 1487 if (vold.p.count < (UT)vold.p.ub) 1488 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 1489 break; 1490 } // if (check CAS result) 1491 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt 1492 } // while (try to steal from particular victim) 1493 } // while (search for victim) 1494 } // if (try to find victim and steal) 1495 } // if (4-byte induction variable) 1496 if (!status) { 1497 *p_lb = 0; 1498 *p_ub = 0; 1499 if (p_st != NULL) 1500 *p_st = 0; 1501 } else { 1502 start = pr->u.p.lb; 1503 init *= chunk; 1504 limit = chunk + init - 1; 1505 incr = pr->u.p.st; 1506 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1507 1508 KMP_DEBUG_ASSERT(init <= trip); 1509 // keep track of done chunks for possible early exit from stealing 1510 // TODO: count executed chunks locally with rare update of shared location 1511 // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1512 if ((last = (limit >= trip)) != 0) 1513 limit = trip; 1514 if (p_st != NULL) 1515 *p_st = incr; 1516 1517 if (incr == 1) { 1518 *p_lb = start + init; 1519 *p_ub = start + limit; 1520 } else { 1521 *p_lb = start + init * incr; 1522 *p_ub = start + limit * incr; 1523 } 1524 } // if 1525 break; 1526 } // case 1527 #endif // KMP_STATIC_STEAL_ENABLED 1528 case kmp_sch_static_balanced: { 1529 KD_TRACE( 1530 10, 1531 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1532 gtid)); 1533 /* check if thread has any iteration to do */ 1534 if ((status = !pr->u.p.count) != 0) { 1535 pr->u.p.count = 1; 1536 *p_lb = pr->u.p.lb; 1537 *p_ub = pr->u.p.ub; 1538 last = (pr->u.p.parm1 != 0); 1539 if (p_st != NULL) 1540 *p_st = pr->u.p.st; 1541 } else { /* no iterations to do */ 1542 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1543 } 1544 } // case 1545 break; 1546 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1547 merged here */ 1548 case kmp_sch_static_chunked: { 1549 T parm1; 1550 1551 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1552 "kmp_sch_static_[affinity|chunked] case\n", 1553 gtid)); 1554 parm1 = pr->u.p.parm1; 1555 1556 trip = pr->u.p.tc - 1; 1557 init = parm1 * (pr->u.p.count + tid); 1558 1559 if ((status = (init <= trip)) != 0) { 1560 start = pr->u.p.lb; 1561 incr = pr->u.p.st; 1562 limit = parm1 + init - 1; 1563 1564 if 
((last = (limit >= trip)) != 0) 1565 limit = trip; 1566 1567 if (p_st != NULL) 1568 *p_st = incr; 1569 1570 pr->u.p.count += nproc; 1571 1572 if (incr == 1) { 1573 *p_lb = start + init; 1574 *p_ub = start + limit; 1575 } else { 1576 *p_lb = start + init * incr; 1577 *p_ub = start + limit * incr; 1578 } 1579 1580 if (pr->flags.ordered) { 1581 pr->u.p.ordered_lower = init; 1582 pr->u.p.ordered_upper = limit; 1583 } // if 1584 } // if 1585 } // case 1586 break; 1587 1588 case kmp_sch_dynamic_chunked: { 1589 UT chunk_number; 1590 UT chunk_size = pr->u.p.parm1; 1591 UT nchunks = pr->u.p.parm2; 1592 1593 KD_TRACE( 1594 100, 1595 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1596 gtid)); 1597 1598 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1599 status = (chunk_number < nchunks); 1600 if (!status) { 1601 *p_lb = 0; 1602 *p_ub = 0; 1603 if (p_st != NULL) 1604 *p_st = 0; 1605 } else { 1606 init = chunk_size * chunk_number; 1607 trip = pr->u.p.tc - 1; 1608 start = pr->u.p.lb; 1609 incr = pr->u.p.st; 1610 1611 if ((last = (trip - init < (UT)chunk_size))) 1612 limit = trip; 1613 else 1614 limit = chunk_size + init - 1; 1615 1616 if (p_st != NULL) 1617 *p_st = incr; 1618 1619 if (incr == 1) { 1620 *p_lb = start + init; 1621 *p_ub = start + limit; 1622 } else { 1623 *p_lb = start + init * incr; 1624 *p_ub = start + limit * incr; 1625 } 1626 1627 if (pr->flags.ordered) { 1628 pr->u.p.ordered_lower = init; 1629 pr->u.p.ordered_upper = limit; 1630 } // if 1631 } // if 1632 } // case 1633 break; 1634 1635 case kmp_sch_guided_iterative_chunked: { 1636 T chunkspec = pr->u.p.parm1; 1637 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1638 "iterative case\n", 1639 gtid)); 1640 trip = pr->u.p.tc; 1641 // Start atomic part of calculations 1642 while (1) { 1643 ST remaining; // signed, because can be < 0 1644 init = sh->u.s.iteration; // shared value 1645 remaining = trip - init; 1646 if (remaining <= 0) { // AC: need to compare with 0 first 1647 // nothing to do, don't try atomic op 1648 status = 0; 1649 break; 1650 } 1651 if ((T)remaining < 1652 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1653 // use dynamic-style schedule 1654 // atomically increment iterations, get old value 1655 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1656 (ST)chunkspec); 1657 remaining = trip - init; 1658 if (remaining <= 0) { 1659 status = 0; // all iterations got by other threads 1660 } else { 1661 // got some iterations to work on 1662 status = 1; 1663 if ((T)remaining > chunkspec) { 1664 limit = init + chunkspec - 1; 1665 } else { 1666 last = true; // the last chunk 1667 limit = init + remaining - 1; 1668 } // if 1669 } // if 1670 break; 1671 } // if 1672 limit = init + (UT)((double)remaining * 1673 *(double *)&pr->u.p.parm3); // divide by K*nproc 1674 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1675 (ST)init, (ST)limit)) { 1676 // CAS was successful, chunk obtained 1677 status = 1; 1678 --limit; 1679 break; 1680 } // if 1681 } // while 1682 if (status != 0) { 1683 start = pr->u.p.lb; 1684 incr = pr->u.p.st; 1685 if (p_st != NULL) 1686 *p_st = incr; 1687 *p_lb = start + init * incr; 1688 *p_ub = start + limit * incr; 1689 if (pr->flags.ordered) { 1690 pr->u.p.ordered_lower = init; 1691 pr->u.p.ordered_upper = limit; 1692 } // if 1693 } else { 1694 *p_lb = 0; 1695 *p_ub = 0; 1696 if (p_st != NULL) 1697 *p_st = 0; 1698 } // if 1699 } // case 1700 break; 1701 1702 case 
kmp_sch_guided_simd: { 1703 // same as iterative but curr-chunk adjusted to be multiple of given 1704 // chunk 1705 T chunk = pr->u.p.parm1; 1706 KD_TRACE(100, 1707 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1708 gtid)); 1709 trip = pr->u.p.tc; 1710 // Start atomic part of calculations 1711 while (1) { 1712 ST remaining; // signed, because can be < 0 1713 init = sh->u.s.iteration; // shared value 1714 remaining = trip - init; 1715 if (remaining <= 0) { // AC: need to compare with 0 first 1716 status = 0; // nothing to do, don't try atomic op 1717 break; 1718 } 1719 KMP_DEBUG_ASSERT(chunk && init % chunk == 0); 1720 // compare with K*nproc*(chunk+1), K=2 by default 1721 if ((T)remaining < pr->u.p.parm2) { 1722 // use dynamic-style schedule 1723 // atomically increment iterations, get old value 1724 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1725 (ST)chunk); 1726 remaining = trip - init; 1727 if (remaining <= 0) { 1728 status = 0; // all iterations got by other threads 1729 } else { 1730 // got some iterations to work on 1731 status = 1; 1732 if ((T)remaining > chunk) { 1733 limit = init + chunk - 1; 1734 } else { 1735 last = true; // the last chunk 1736 limit = init + remaining - 1; 1737 } // if 1738 } // if 1739 break; 1740 } // if 1741 // divide by K*nproc 1742 UT span; 1743 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), 1744 &span); 1745 UT rem = span % chunk; 1746 if (rem) // adjust so that span%chunk == 0 1747 span += chunk - rem; 1748 limit = init + span; 1749 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1750 (ST)init, (ST)limit)) { 1751 // CAS was successful, chunk obtained 1752 status = 1; 1753 --limit; 1754 break; 1755 } // if 1756 } // while 1757 if (status != 0) { 1758 start = pr->u.p.lb; 1759 incr = pr->u.p.st; 1760 if (p_st != NULL) 1761 *p_st = incr; 1762 *p_lb = start + init * incr; 1763 *p_ub = start + limit * incr; 1764 if (pr->flags.ordered) { 1765 pr->u.p.ordered_lower = init; 1766 pr->u.p.ordered_upper = limit; 1767 } // if 1768 } else { 1769 *p_lb = 0; 1770 *p_ub = 0; 1771 if (p_st != NULL) 1772 *p_st = 0; 1773 } // if 1774 } // case 1775 break; 1776 1777 case kmp_sch_guided_analytical_chunked: { 1778 T chunkspec = pr->u.p.parm1; 1779 UT chunkIdx; 1780 #if KMP_USE_X87CONTROL 1781 /* for storing original FPCW value for Windows* OS on 1782 IA-32 architecture 8-byte version */ 1783 unsigned int oldFpcw; 1784 unsigned int fpcwSet = 0; 1785 #endif 1786 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1787 "kmp_sch_guided_analytical_chunked case\n", 1788 gtid)); 1789 1790 trip = pr->u.p.tc; 1791 1792 KMP_DEBUG_ASSERT(nproc > 1); 1793 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1794 1795 while (1) { /* this while loop is a safeguard against unexpected zero 1796 chunk sizes */ 1797 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1798 if (chunkIdx >= (UT)pr->u.p.parm2) { 1799 --trip; 1800 /* use dynamic-style scheduling */ 1801 init = chunkIdx * chunkspec + pr->u.p.count; 1802 /* need to verify init > 0 in case of overflow in the above 1803 * calculation */ 1804 if ((status = (init > 0 && init <= trip)) != 0) { 1805 limit = init + chunkspec - 1; 1806 1807 if ((last = (limit >= trip)) != 0) 1808 limit = trip; 1809 } 1810 break; 1811 } else { 1812 /* use exponential-style scheduling */ 1813 /* The following check is to work around the lack of long double precision on 1814 Windows* OS.
1815 This check works around the possible effect that init != 0 for chunkIdx == 0. 1816 */ 1817 #if KMP_USE_X87CONTROL 1818 /* If we haven't already done so, save original 1819 FPCW and set precision to 64-bit, as Windows* OS 1820 on IA-32 architecture defaults to 53-bit */ 1821 if (!fpcwSet) { 1822 oldFpcw = _control87(0, 0); 1823 _control87(_PC_64, _MCW_PC); 1824 fpcwSet = 0x30000; 1825 } 1826 #endif 1827 if (chunkIdx) { 1828 init = __kmp_dispatch_guided_remaining<T>( 1829 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1830 KMP_DEBUG_ASSERT(init); 1831 init = trip - init; 1832 } else 1833 init = 0; 1834 limit = trip - __kmp_dispatch_guided_remaining<T>( 1835 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1836 KMP_ASSERT(init <= limit); 1837 if (init < limit) { 1838 KMP_DEBUG_ASSERT(limit <= trip); 1839 --limit; 1840 status = 1; 1841 break; 1842 } // if 1843 } // if 1844 } // while (1) 1845 #if KMP_USE_X87CONTROL 1846 /* restore FPCW if necessary 1847 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1848 */ 1849 if (fpcwSet && (oldFpcw & fpcwSet)) 1850 _control87(oldFpcw, _MCW_PC); 1851 #endif 1852 if (status != 0) { 1853 start = pr->u.p.lb; 1854 incr = pr->u.p.st; 1855 if (p_st != NULL) 1856 *p_st = incr; 1857 *p_lb = start + init * incr; 1858 *p_ub = start + limit * incr; 1859 if (pr->flags.ordered) { 1860 pr->u.p.ordered_lower = init; 1861 pr->u.p.ordered_upper = limit; 1862 } 1863 } else { 1864 *p_lb = 0; 1865 *p_ub = 0; 1866 if (p_st != NULL) 1867 *p_st = 0; 1868 } 1869 } // case 1870 break; 1871 1872 case kmp_sch_trapezoidal: { 1873 UT index; 1874 T parm2 = pr->u.p.parm2; 1875 T parm3 = pr->u.p.parm3; 1876 T parm4 = pr->u.p.parm4; 1877 KD_TRACE(100, 1878 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1879 gtid)); 1880 1881 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1882 1883 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1884 trip = pr->u.p.tc - 1; 1885 1886 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1887 *p_lb = 0; 1888 *p_ub = 0; 1889 if (p_st != NULL) 1890 *p_st = 0; 1891 } else { 1892 start = pr->u.p.lb; 1893 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1894 incr = pr->u.p.st; 1895 1896 if ((last = (limit >= trip)) != 0) 1897 limit = trip; 1898 1899 if (p_st != NULL) 1900 *p_st = incr; 1901 1902 if (incr == 1) { 1903 *p_lb = start + init; 1904 *p_ub = start + limit; 1905 } else { 1906 *p_lb = start + init * incr; 1907 *p_ub = start + limit * incr; 1908 } 1909 1910 if (pr->flags.ordered) { 1911 pr->u.p.ordered_lower = init; 1912 pr->u.p.ordered_upper = limit; 1913 } // if 1914 } // if 1915 } // case 1916 break; 1917 default: { 1918 status = 0; // to avoid complaints on uninitialized variable use 1919 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1920 KMP_HNT(GetNewerLibrary), // Hint 1921 __kmp_msg_null // Variadic argument list terminator 1922 ); 1923 } break; 1924 } // switch 1925 if (p_last) 1926 *p_last = last; 1927 #ifdef KMP_DEBUG 1928 if (pr->flags.ordered) { 1929 char *buff; 1930 // create format specifiers before the debug output 1931 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1932 "ordered_lower:%%%s ordered_upper:%%%s\n", 1933 traits_t<UT>::spec, traits_t<UT>::spec); 1934 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1935 __kmp_str_free(&buff); 1936 } 1937 { 1938 char *buff; 1939 // create format specifiers before the debug output 1940 buff = __kmp_str_format( 1941 "__kmp_dispatch_next_algorithm: T#%%d 
exit status:%%d p_last:%%d " 1942 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1943 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1944 KMP_DEBUG_ASSERT(p_last); 1945 KMP_DEBUG_ASSERT(p_st); 1946 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1947 __kmp_str_free(&buff); 1948 } 1949 #endif 1950 return status; 1951 } 1952 1953 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1954 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1955 is not called. */ 1956 #if OMPT_SUPPORT && OMPT_OPTIONAL 1957 #define OMPT_LOOP_END \ 1958 if (status == 0) { \ 1959 if (ompt_enabled.ompt_callback_work) { \ 1960 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1961 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1962 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1963 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1964 &(task_info->task_data), 0, codeptr); \ 1965 } \ 1966 } 1967 // TODO: implement count 1968 #else 1969 #define OMPT_LOOP_END // no-op 1970 #endif 1971 1972 #if KMP_STATS_ENABLED 1973 #define KMP_STATS_LOOP_END \ 1974 { \ 1975 kmp_int64 u, l, t, i; \ 1976 l = (kmp_int64)(*p_lb); \ 1977 u = (kmp_int64)(*p_ub); \ 1978 i = (kmp_int64)(pr->u.p.st); \ 1979 if (status == 0) { \ 1980 t = 0; \ 1981 KMP_POP_PARTITIONED_TIMER(); \ 1982 } else if (i == 1) { \ 1983 if (u >= l) \ 1984 t = u - l + 1; \ 1985 else \ 1986 t = 0; \ 1987 } else if (i < 0) { \ 1988 if (l >= u) \ 1989 t = (l - u) / (-i) + 1; \ 1990 else \ 1991 t = 0; \ 1992 } else { \ 1993 if (u >= l) \ 1994 t = (u - l) / i + 1; \ 1995 else \ 1996 t = 0; \ 1997 } \ 1998 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1999 } 2000 #else 2001 #define KMP_STATS_LOOP_END /* Nothing */ 2002 #endif 2003 2004 template <typename T> 2005 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 2006 T *p_lb, T *p_ub, 2007 typename traits_t<T>::signed_t *p_st 2008 #if OMPT_SUPPORT && OMPT_OPTIONAL 2009 , 2010 void *codeptr 2011 #endif 2012 ) { 2013 2014 typedef typename traits_t<T>::unsigned_t UT; 2015 typedef typename traits_t<T>::signed_t ST; 2016 // This is potentially slightly misleading, schedule(runtime) will appear here 2017 // even if the actual runtime schedule is static. (Which points out a 2018 // disadvantage of schedule(runtime): even when static scheduling is used it 2019 // costs more than a compile time choice to use static scheduling would.) 
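// For reference, the KMP_STATS_LOOP_END macro above records the usual
// closed-form chunk trip count. With the hypothetical bounds l = 3, u = 11 and
// stride i = 4 it counts (u - l) / i + 1 = (11 - 3) / 4 + 1 = 3 iterations
// (3, 7, 11); a negative stride uses the mirrored form (l - u) / (-i) + 1, and
// a finished loop (status == 0) records 0.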
2020 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 2021 2022 int status; 2023 dispatch_private_info_template<T> *pr; 2024 __kmp_assert_valid_gtid(gtid); 2025 kmp_info_t *th = __kmp_threads[gtid]; 2026 kmp_team_t *team = th->th.th_team; 2027 2028 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 2029 KD_TRACE( 2030 1000, 2031 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 2032 gtid, p_lb, p_ub, p_st, p_last)); 2033 2034 if (team->t.t_serialized) { 2035 /* NOTE: serialize this dispatch because we are not at the active level */ 2036 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2037 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 2038 KMP_DEBUG_ASSERT(pr); 2039 2040 if ((status = (pr->u.p.tc != 0)) == 0) { 2041 *p_lb = 0; 2042 *p_ub = 0; 2043 // if ( p_last != NULL ) 2044 // *p_last = 0; 2045 if (p_st != NULL) 2046 *p_st = 0; 2047 if (__kmp_env_consistency_check) { 2048 if (pr->pushed_ws != ct_none) { 2049 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2050 } 2051 } 2052 } else if (pr->flags.nomerge) { 2053 kmp_int32 last; 2054 T start; 2055 UT limit, trip, init; 2056 ST incr; 2057 T chunk = pr->u.p.parm1; 2058 2059 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 2060 gtid)); 2061 2062 init = chunk * pr->u.p.count++; 2063 trip = pr->u.p.tc - 1; 2064 2065 if ((status = (init <= trip)) == 0) { 2066 *p_lb = 0; 2067 *p_ub = 0; 2068 // if ( p_last != NULL ) 2069 // *p_last = 0; 2070 if (p_st != NULL) 2071 *p_st = 0; 2072 if (__kmp_env_consistency_check) { 2073 if (pr->pushed_ws != ct_none) { 2074 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2075 } 2076 } 2077 } else { 2078 start = pr->u.p.lb; 2079 limit = chunk + init - 1; 2080 incr = pr->u.p.st; 2081 2082 if ((last = (limit >= trip)) != 0) { 2083 limit = trip; 2084 #if KMP_OS_WINDOWS 2085 pr->u.p.last_upper = pr->u.p.ub; 2086 #endif /* KMP_OS_WINDOWS */ 2087 } 2088 if (p_last != NULL) 2089 *p_last = last; 2090 if (p_st != NULL) 2091 *p_st = incr; 2092 if (incr == 1) { 2093 *p_lb = start + init; 2094 *p_ub = start + limit; 2095 } else { 2096 *p_lb = start + init * incr; 2097 *p_ub = start + limit * incr; 2098 } 2099 2100 if (pr->flags.ordered) { 2101 pr->u.p.ordered_lower = init; 2102 pr->u.p.ordered_upper = limit; 2103 #ifdef KMP_DEBUG 2104 { 2105 char *buff; 2106 // create format specifiers before the debug output 2107 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2108 "ordered_lower:%%%s ordered_upper:%%%s\n", 2109 traits_t<UT>::spec, traits_t<UT>::spec); 2110 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2111 pr->u.p.ordered_upper)); 2112 __kmp_str_free(&buff); 2113 } 2114 #endif 2115 } // if 2116 } // if 2117 } else { 2118 pr->u.p.tc = 0; 2119 *p_lb = pr->u.p.lb; 2120 *p_ub = pr->u.p.ub; 2121 #if KMP_OS_WINDOWS 2122 pr->u.p.last_upper = *p_ub; 2123 #endif /* KMP_OS_WINDOWS */ 2124 if (p_last != NULL) 2125 *p_last = TRUE; 2126 if (p_st != NULL) 2127 *p_st = pr->u.p.st; 2128 } // if 2129 #ifdef KMP_DEBUG 2130 { 2131 char *buff; 2132 // create format specifiers before the debug output 2133 buff = __kmp_str_format( 2134 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2135 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2136 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2137 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, 2138 (p_last ? 
*p_last : 0), status)); 2139 __kmp_str_free(&buff); 2140 } 2141 #endif 2142 #if INCLUDE_SSC_MARKS 2143 SSC_MARK_DISPATCH_NEXT(); 2144 #endif 2145 OMPT_LOOP_END; 2146 KMP_STATS_LOOP_END; 2147 return status; 2148 } else { 2149 kmp_int32 last = 0; 2150 dispatch_shared_info_template<T> volatile *sh; 2151 2152 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2153 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2154 2155 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2156 th->th.th_dispatch->th_dispatch_pr_current); 2157 KMP_DEBUG_ASSERT(pr); 2158 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2159 th->th.th_dispatch->th_dispatch_sh_current); 2160 KMP_DEBUG_ASSERT(sh); 2161 2162 #if KMP_USE_HIER_SCHED 2163 if (pr->flags.use_hier) 2164 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2165 else 2166 #endif // KMP_USE_HIER_SCHED 2167 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2168 p_st, th->th.th_team_nproc, 2169 th->th.th_info.ds.ds_tid); 2170 // status == 0: no more iterations to execute 2171 if (status == 0) { 2172 ST num_done; 2173 num_done = test_then_inc<ST>(&sh->u.s.num_done); 2174 #ifdef KMP_DEBUG 2175 { 2176 char *buff; 2177 // create format specifiers before the debug output 2178 buff = __kmp_str_format( 2179 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2180 traits_t<ST>::spec); 2181 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2182 __kmp_str_free(&buff); 2183 } 2184 #endif 2185 2186 #if KMP_USE_HIER_SCHED 2187 pr->flags.use_hier = FALSE; 2188 #endif 2189 if (num_done == th->th.th_team_nproc - 1) { 2190 #if KMP_STATIC_STEAL_ENABLED 2191 if (pr->schedule == kmp_sch_static_steal) { 2192 int i; 2193 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2194 __kmp_dispatch_num_buffers; // current loop index 2195 // loop complete, safe to destroy locks used for stealing 2196 for (i = 0; i < th->th.th_team_nproc; ++i) { 2197 dispatch_private_info_template<T> *buf = 2198 reinterpret_cast<dispatch_private_info_template<T> *>( 2199 &team->t.t_dispatch[i].th_disp_buffer[idx]); 2200 KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive 2201 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED); 2202 if (traits_t<T>::type_size > 4) { 2203 // destroy locks used for stealing 2204 kmp_lock_t *lck = buf->u.p.steal_lock; 2205 KMP_ASSERT(lck != NULL); 2206 __kmp_destroy_lock(lck); 2207 __kmp_free(lck); 2208 buf->u.p.steal_lock = NULL; 2209 } 2210 } 2211 } 2212 #endif 2213 /* NOTE: release shared buffer to be reused */ 2214 2215 KMP_MB(); /* Flush all pending memory write invalidates. */ 2216 2217 sh->u.s.num_done = 0; 2218 sh->u.s.iteration = 0; 2219 2220 /* TODO replace with general release procedure? */ 2221 if (pr->flags.ordered) { 2222 sh->u.s.ordered_iteration = 0; 2223 } 2224 2225 sh->buffer_index += __kmp_dispatch_num_buffers; 2226 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2227 gtid, sh->buffer_index)); 2228 2229 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2230 2231 } // if 2232 if (__kmp_env_consistency_check) { 2233 if (pr->pushed_ws != ct_none) { 2234 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2235 } 2236 } 2237 2238 th->th.th_dispatch->th_deo_fcn = NULL; 2239 th->th.th_dispatch->th_dxo_fcn = NULL; 2240 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2241 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2242 } // if (status == 0) 2243 #if KMP_OS_WINDOWS 2244 else if (last) { 2245 pr->u.p.last_upper = pr->u.p.ub; 2246 } 2247 #endif /* KMP_OS_WINDOWS */ 2248 if (p_last != NULL && status != 0) 2249 *p_last = last; 2250 } // if 2251 2252 #ifdef KMP_DEBUG 2253 { 2254 char *buff; 2255 // create format specifiers before the debug output 2256 buff = __kmp_str_format( 2257 "__kmp_dispatch_next: T#%%d normal case: " 2258 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2259 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2260 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2261 (p_last ? *p_last : 0), status)); 2262 __kmp_str_free(&buff); 2263 } 2264 #endif 2265 #if INCLUDE_SSC_MARKS 2266 SSC_MARK_DISPATCH_NEXT(); 2267 #endif 2268 OMPT_LOOP_END; 2269 KMP_STATS_LOOP_END; 2270 return status; 2271 } 2272 2273 template <typename T> 2274 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2275 kmp_int32 *plastiter, T *plower, T *pupper, 2276 typename traits_t<T>::signed_t incr) { 2277 typedef typename traits_t<T>::unsigned_t UT; 2278 kmp_uint32 team_id; 2279 kmp_uint32 nteams; 2280 UT trip_count; 2281 kmp_team_t *team; 2282 kmp_info_t *th; 2283 2284 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2285 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2286 #ifdef KMP_DEBUG 2287 typedef typename traits_t<T>::signed_t ST; 2288 { 2289 char *buff; 2290 // create format specifiers before the debug output 2291 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2292 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2293 traits_t<T>::spec, traits_t<T>::spec, 2294 traits_t<ST>::spec, traits_t<T>::spec); 2295 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2296 __kmp_str_free(&buff); 2297 } 2298 #endif 2299 2300 if (__kmp_env_consistency_check) { 2301 if (incr == 0) { 2302 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2303 loc); 2304 } 2305 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2306 // The loop is illegal. 
2307 // Some zero-trip loops maintained by compiler, e.g.: 2308 // for(i=10;i<0;++i) // lower >= upper - run-time check 2309 // for(i=0;i>10;--i) // lower <= upper - run-time check 2310 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2311 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2312 // Compiler does not check the following illegal loops: 2313 // for(i=0;i<10;i+=incr) // where incr<0 2314 // for(i=10;i>0;i-=incr) // where incr<0 2315 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2316 } 2317 } 2318 __kmp_assert_valid_gtid(gtid); 2319 th = __kmp_threads[gtid]; 2320 team = th->th.th_team; 2321 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2322 nteams = th->th.th_teams_size.nteams; 2323 team_id = team->t.t_master_tid; 2324 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2325 2326 // compute global trip count 2327 if (incr == 1) { 2328 trip_count = *pupper - *plower + 1; 2329 } else if (incr == -1) { 2330 trip_count = *plower - *pupper + 1; 2331 } else if (incr > 0) { 2332 // upper-lower can exceed the limit of signed type 2333 trip_count = (UT)(*pupper - *plower) / incr + 1; 2334 } else { 2335 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2336 } 2337 2338 if (trip_count <= nteams) { 2339 KMP_DEBUG_ASSERT( 2340 __kmp_static == kmp_sch_static_greedy || 2341 __kmp_static == 2342 kmp_sch_static_balanced); // Unknown static scheduling type. 2343 // only some teams get single iteration, others get nothing 2344 if (team_id < trip_count) { 2345 *pupper = *plower = *plower + team_id * incr; 2346 } else { 2347 *plower = *pupper + incr; // zero-trip loop 2348 } 2349 if (plastiter != NULL) 2350 *plastiter = (team_id == trip_count - 1); 2351 } else { 2352 if (__kmp_static == kmp_sch_static_balanced) { 2353 UT chunk = trip_count / nteams; 2354 UT extras = trip_count % nteams; 2355 *plower += 2356 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2357 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2358 if (plastiter != NULL) 2359 *plastiter = (team_id == nteams - 1); 2360 } else { 2361 T chunk_inc_count = 2362 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2363 T upper = *pupper; 2364 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2365 // Unknown static scheduling type. 2366 *plower += team_id * chunk_inc_count; 2367 *pupper = *plower + chunk_inc_count - incr; 2368 // Check/correct bounds if needed 2369 if (incr > 0) { 2370 if (*pupper < *plower) 2371 *pupper = traits_t<T>::max_value; 2372 if (plastiter != NULL) 2373 *plastiter = *plower <= upper && *pupper > upper - incr; 2374 if (*pupper > upper) 2375 *pupper = upper; // tracker C73258 2376 } else { 2377 if (*pupper > *plower) 2378 *pupper = traits_t<T>::min_value; 2379 if (plastiter != NULL) 2380 *plastiter = *plower >= upper && *pupper < upper - incr; 2381 if (*pupper < upper) 2382 *pupper = upper; // tracker C73258 2383 } 2384 } 2385 } 2386 } 2387 2388 //----------------------------------------------------------------------------- 2389 // Dispatch routines 2390 // Transfer call to template< type T > 2391 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2392 // T lb, T ub, ST st, ST chunk ) 2393 extern "C" { 2394 2395 /*! 
2396 @ingroup WORK_SHARING 2397 @{ 2398 @param loc Source location 2399 @param gtid Global thread id 2400 @param schedule Schedule type 2401 @param lb Lower bound 2402 @param ub Upper bound 2403 @param st Step (or increment if you prefer) 2404 @param chunk The chunk size to block with 2405 2406 This function prepares the runtime to start a dynamically scheduled for loop, 2407 saving the loop arguments. 2408 These functions are all identical apart from the types of the arguments. 2409 */ 2410 2411 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2412 enum sched_type schedule, kmp_int32 lb, 2413 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2414 KMP_DEBUG_ASSERT(__kmp_init_serial); 2415 #if OMPT_SUPPORT && OMPT_OPTIONAL 2416 OMPT_STORE_RETURN_ADDRESS(gtid); 2417 #endif 2418 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2419 } 2420 /*! 2421 See @ref __kmpc_dispatch_init_4 2422 */ 2423 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2424 enum sched_type schedule, kmp_uint32 lb, 2425 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2426 KMP_DEBUG_ASSERT(__kmp_init_serial); 2427 #if OMPT_SUPPORT && OMPT_OPTIONAL 2428 OMPT_STORE_RETURN_ADDRESS(gtid); 2429 #endif 2430 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2431 } 2432 2433 /*! 2434 See @ref __kmpc_dispatch_init_4 2435 */ 2436 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2437 enum sched_type schedule, kmp_int64 lb, 2438 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2439 KMP_DEBUG_ASSERT(__kmp_init_serial); 2440 #if OMPT_SUPPORT && OMPT_OPTIONAL 2441 OMPT_STORE_RETURN_ADDRESS(gtid); 2442 #endif 2443 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2444 } 2445 2446 /*! 2447 See @ref __kmpc_dispatch_init_4 2448 */ 2449 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2450 enum sched_type schedule, kmp_uint64 lb, 2451 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2452 KMP_DEBUG_ASSERT(__kmp_init_serial); 2453 #if OMPT_SUPPORT && OMPT_OPTIONAL 2454 OMPT_STORE_RETURN_ADDRESS(gtid); 2455 #endif 2456 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2457 } 2458 2459 /*! 2460 See @ref __kmpc_dispatch_init_4 2461 2462 These functions differ from the __kmpc_dispatch_init set of functions in that 2463 they are called for the composite distribute parallel for construct. Thus, 2464 before dispatching the regular iterations, we need to compute the per-team 2465 iteration space. 2466 These functions are all identical apart from the types of the arguments.
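
As a hedged illustration (not the exact compiler lowering; n_global is a
hypothetical name), a team's portion of a distribute parallel for loop could be
initialized as:
@code
kmp_int32 last = 0;
// __kmp_dist_get_bounds() first narrows lb/ub to this team's slice of the
// 0..n_global-1 iteration space, then the regular dispatch_init is performed.
__kmpc_dist_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, &last, 0,
                            n_global - 1, 1, 4);
@endcode
Chunks are then retrieved with the usual __kmpc_dispatch_next_4() loop.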
2467 */ 2468 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2469 enum sched_type schedule, kmp_int32 *p_last, 2470 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2471 kmp_int32 chunk) { 2472 KMP_DEBUG_ASSERT(__kmp_init_serial); 2473 #if OMPT_SUPPORT && OMPT_OPTIONAL 2474 OMPT_STORE_RETURN_ADDRESS(gtid); 2475 #endif 2476 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2477 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2478 } 2479 2480 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2481 enum sched_type schedule, kmp_int32 *p_last, 2482 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2483 kmp_int32 chunk) { 2484 KMP_DEBUG_ASSERT(__kmp_init_serial); 2485 #if OMPT_SUPPORT && OMPT_OPTIONAL 2486 OMPT_STORE_RETURN_ADDRESS(gtid); 2487 #endif 2488 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2489 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2490 } 2491 2492 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2493 enum sched_type schedule, kmp_int32 *p_last, 2494 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2495 kmp_int64 chunk) { 2496 KMP_DEBUG_ASSERT(__kmp_init_serial); 2497 #if OMPT_SUPPORT && OMPT_OPTIONAL 2498 OMPT_STORE_RETURN_ADDRESS(gtid); 2499 #endif 2500 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2501 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2502 } 2503 2504 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2505 enum sched_type schedule, kmp_int32 *p_last, 2506 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2507 kmp_int64 chunk) { 2508 KMP_DEBUG_ASSERT(__kmp_init_serial); 2509 #if OMPT_SUPPORT && OMPT_OPTIONAL 2510 OMPT_STORE_RETURN_ADDRESS(gtid); 2511 #endif 2512 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2513 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2514 } 2515 2516 /*! 2517 @param loc Source code location 2518 @param gtid Global thread id 2519 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2520 otherwise 2521 @param p_lb Pointer to the lower bound for the next chunk of work 2522 @param p_ub Pointer to the upper bound for the next chunk of work 2523 @param p_st Pointer to the stride for the next chunk of work 2524 @return one if there is work to be done, zero otherwise 2525 2526 Get the next dynamically allocated chunk of work for this thread. 2527 If there is no more work, then the lb,ub and stride need not be modified. 2528 */ 2529 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2530 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2531 #if OMPT_SUPPORT && OMPT_OPTIONAL 2532 OMPT_STORE_RETURN_ADDRESS(gtid); 2533 #endif 2534 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2535 #if OMPT_SUPPORT && OMPT_OPTIONAL 2536 , 2537 OMPT_LOAD_RETURN_ADDRESS(gtid) 2538 #endif 2539 ); 2540 } 2541 2542 /*! 2543 See @ref __kmpc_dispatch_next_4 2544 */ 2545 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2546 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2547 kmp_int32 *p_st) { 2548 #if OMPT_SUPPORT && OMPT_OPTIONAL 2549 OMPT_STORE_RETURN_ADDRESS(gtid); 2550 #endif 2551 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2552 #if OMPT_SUPPORT && OMPT_OPTIONAL 2553 , 2554 OMPT_LOAD_RETURN_ADDRESS(gtid) 2555 #endif 2556 ); 2557 } 2558 2559 /*! 
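As an illustrative sketch (not part of the library; n and body() are
hypothetical names), compiler-generated code typically drives this family of
entry points as follows:
@code
__kmpc_dispatch_init_8(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4);
kmp_int64 lb, ub, st;
kmp_int32 last;
while (__kmpc_dispatch_next_8(loc, gtid, &last, &lb, &ub, &st)) {
  for (kmp_int64 i = lb; i <= ub; i += st)
    body(i);
}
@endcode
Each successful call returns an inclusive chunk [lb, ub] with stride st; a zero
return value means no work remains.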
2560 See @ref __kmpc_dispatch_next_4 2561 */ 2562 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2563 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2564 #if OMPT_SUPPORT && OMPT_OPTIONAL 2565 OMPT_STORE_RETURN_ADDRESS(gtid); 2566 #endif 2567 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2568 #if OMPT_SUPPORT && OMPT_OPTIONAL 2569 , 2570 OMPT_LOAD_RETURN_ADDRESS(gtid) 2571 #endif 2572 ); 2573 } 2574 2575 /*! 2576 See @ref __kmpc_dispatch_next_4 2577 */ 2578 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2579 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2580 kmp_int64 *p_st) { 2581 #if OMPT_SUPPORT && OMPT_OPTIONAL 2582 OMPT_STORE_RETURN_ADDRESS(gtid); 2583 #endif 2584 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2585 #if OMPT_SUPPORT && OMPT_OPTIONAL 2586 , 2587 OMPT_LOAD_RETURN_ADDRESS(gtid) 2588 #endif 2589 ); 2590 } 2591 2592 /*! 2593 @param loc Source code location 2594 @param gtid Global thread id 2595 2596 Mark the end of a dynamic loop. 2597 */ 2598 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2599 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2600 } 2601 2602 /*! 2603 See @ref __kmpc_dispatch_fini_4 2604 */ 2605 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2606 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2607 } 2608 2609 /*! 2610 See @ref __kmpc_dispatch_fini_4 2611 */ 2612 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2613 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2614 } 2615 2616 /*! 2617 See @ref __kmpc_dispatch_fini_4 2618 */ 2619 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2620 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2621 } 2622 /*! @} */ 2623 2624 //----------------------------------------------------------------------------- 2625 // Non-template routines from kmp_dispatch.cpp used in other sources 2626 2627 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2628 return value == checker; 2629 } 2630 2631 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2632 return value != checker; 2633 } 2634 2635 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2636 return value < checker; 2637 } 2638 2639 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2640 return value >= checker; 2641 } 2642 2643 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2644 return value <= checker; 2645 } 2646 2647 kmp_uint32 2648 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2649 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2650 void *obj // Higher-level synchronization object, or NULL. 2651 ) { 2652 // note: we may not belong to a team at this point 2653 volatile kmp_uint32 *spin = spinner; 2654 kmp_uint32 check = checker; 2655 kmp_uint32 spins; 2656 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2657 kmp_uint32 r; 2658 kmp_uint64 time; 2659 2660 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2661 KMP_INIT_YIELD(spins); 2662 KMP_INIT_BACKOFF(time); 2663 // main wait spin loop 2664 while (!f(r = TCR_4(*spin), check)) { 2665 KMP_FSYNC_SPIN_PREPARE(obj); 2666 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2667 split. 
It causes problems with infinite recursion because of exit lock */ 2668 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2669 __kmp_abort_thread(); */ 2670 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); 2671 } 2672 KMP_FSYNC_SPIN_ACQUIRED(obj); 2673 return r; 2674 } 2675 2676 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, 2677 kmp_uint32 (*pred)(void *, kmp_uint32), 2678 void *obj // Higher-level synchronization object, or NULL. 2679 ) { 2680 // note: we may not belong to a team at this point 2681 void *spin = spinner; 2682 kmp_uint32 check = checker; 2683 kmp_uint32 spins; 2684 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2685 kmp_uint64 time; 2686 2687 KMP_FSYNC_SPIN_INIT(obj, spin); 2688 KMP_INIT_YIELD(spins); 2689 KMP_INIT_BACKOFF(time); 2690 // main wait spin loop 2691 while (!f(spin, check)) { 2692 KMP_FSYNC_SPIN_PREPARE(obj); 2693 /* if we have waited a bit, or are oversubscribed, yield */ 2694 /* pause is in the following code */ 2695 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); 2696 } 2697 KMP_FSYNC_SPIN_ACQUIRED(obj); 2698 } 2699 2700 } // extern "C" 2701 2702 #ifdef KMP_GOMP_COMPAT 2703 2704 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2705 enum sched_type schedule, kmp_int32 lb, 2706 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2707 int push_ws) { 2708 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2709 push_ws); 2710 } 2711 2712 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2713 enum sched_type schedule, kmp_uint32 lb, 2714 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2715 int push_ws) { 2716 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2717 push_ws); 2718 } 2719 2720 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2721 enum sched_type schedule, kmp_int64 lb, 2722 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2723 int push_ws) { 2724 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2725 push_ws); 2726 } 2727 2728 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2729 enum sched_type schedule, kmp_uint64 lb, 2730 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2731 int push_ws) { 2732 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2733 push_ws); 2734 } 2735 2736 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2737 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2738 } 2739 2740 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2741 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2742 } 2743 2744 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2745 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2746 } 2747 2748 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2749 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2750 } 2751 2752 #endif /* KMP_GOMP_COMPAT */ 2753 2754 /* ------------------------------------------------------------------------ */ 2755
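
// Example (an illustrative sketch, not used by the runtime itself; the flag
// below is a hypothetical variable): __kmp_wait_4() spins, yielding and backing
// off as needed, until the supplied predicate holds for the watched 32-bit
// location:
//
//   volatile kmp_uint32 flag = 0; // another thread eventually stores 1 here
//   __kmp_wait_4(&flag, 1, __kmp_ge_4, NULL); // returns once flag >= 1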