1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 /* Dynamic scheduling initialization and dispatch. 14 * 15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 16 * it may change values between parallel regions. __kmp_max_nth 17 * is the largest value __kmp_nth may take, 1 is the smallest. 18 */ 19 20 #include "kmp.h" 21 #include "kmp_error.h" 22 #include "kmp_i18n.h" 23 #include "kmp_itt.h" 24 #include "kmp_stats.h" 25 #include "kmp_str.h" 26 #if KMP_USE_X87CONTROL 27 #include <float.h> 28 #endif 29 #include "kmp_lock.h" 30 #include "kmp_dispatch.h" 31 #if KMP_USE_HIER_SCHED 32 #include "kmp_dispatch_hier.h" 33 #endif 34 35 #if OMPT_SUPPORT 36 #include "ompt-specific.h" 37 #endif 38 39 /* ------------------------------------------------------------------------ */ 40 /* ------------------------------------------------------------------------ */ 41 42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 43 kmp_info_t *th; 44 45 KMP_DEBUG_ASSERT(gtid_ref); 46 47 if (__kmp_env_consistency_check) { 48 th = __kmp_threads[*gtid_ref]; 49 if (th->th.th_root->r.r_active && 50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 51 #if KMP_USE_DYNAMIC_LOCK 52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 53 #else 54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 55 #endif 56 } 57 } 58 } 59 60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 61 kmp_info_t *th; 62 63 if (__kmp_env_consistency_check) { 64 th = __kmp_threads[*gtid_ref]; 65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 67 } 68 } 69 } 70 71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC 72 static inline int __kmp_get_monotonicity(enum sched_type schedule, 73 bool use_hier = false) { 74 // Pick up the nonmonotonic/monotonic bits from the scheduling type 75 int monotonicity; 76 // default to monotonic 77 monotonicity = SCHEDULE_MONOTONIC; 78 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 79 monotonicity = SCHEDULE_NONMONOTONIC; 80 else if (SCHEDULE_HAS_MONOTONIC(schedule)) 81 monotonicity = SCHEDULE_MONOTONIC; 82 return monotonicity; 83 } 84 85 // Initialize a dispatch_private_info_template<T> buffer for a particular 86 // type of schedule,chunk. The loop description is found in lb (lower bound), 87 // ub (upper bound), and st (stride). nproc is the number of threads relevant 88 // to the scheduling (often the number of threads in a team, but not always if 89 // hierarchical scheduling is used). tid is the id of the thread calling 90 // the function within the group of nproc threads. It will have a value 91 // between 0 and nproc - 1. This is often just the thread id within a team, but 92 // is not necessarily the case when using hierarchical scheduling. 
93 // loc is the source file location of the corresponding loop 94 // gtid is the global thread id 95 template <typename T> 96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 97 dispatch_private_info_template<T> *pr, 98 enum sched_type schedule, T lb, T ub, 99 typename traits_t<T>::signed_t st, 100 #if USE_ITT_BUILD 101 kmp_uint64 *cur_chunk, 102 #endif 103 typename traits_t<T>::signed_t chunk, 104 T nproc, T tid) { 105 typedef typename traits_t<T>::unsigned_t UT; 106 typedef typename traits_t<T>::floating_t DBL; 107 108 int active; 109 T tc; 110 kmp_info_t *th; 111 kmp_team_t *team; 112 int monotonicity; 113 bool use_hier; 114 115 #ifdef KMP_DEBUG 116 typedef typename traits_t<T>::signed_t ST; 117 { 118 char *buff; 119 // create format specifiers before the debug output 120 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 121 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 122 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 123 traits_t<T>::spec, traits_t<T>::spec, 124 traits_t<ST>::spec, traits_t<ST>::spec, 125 traits_t<T>::spec, traits_t<T>::spec); 126 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 127 __kmp_str_free(&buff); 128 } 129 #endif 130 /* setup data */ 131 th = __kmp_threads[gtid]; 132 team = th->th.th_team; 133 active = !team->t.t_serialized; 134 135 #if USE_ITT_BUILD 136 int itt_need_metadata_reporting = 137 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 138 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 139 team->t.t_active_level == 1; 140 #endif 141 142 #if KMP_USE_HIER_SCHED 143 use_hier = pr->flags.use_hier; 144 #else 145 use_hier = false; 146 #endif 147 148 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 149 monotonicity = __kmp_get_monotonicity(schedule, use_hier); 150 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 151 152 /* Pick up the nomerge/ordered bits from the scheduling type */ 153 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 154 pr->flags.nomerge = TRUE; 155 schedule = 156 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 157 } else { 158 pr->flags.nomerge = FALSE; 159 } 160 pr->type_size = traits_t<T>::type_size; // remember the size of variables 161 if (kmp_ord_lower & schedule) { 162 pr->flags.ordered = TRUE; 163 schedule = 164 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 165 } else { 166 pr->flags.ordered = FALSE; 167 } 168 // Ordered overrides nonmonotonic 169 if (pr->flags.ordered) { 170 monotonicity = SCHEDULE_MONOTONIC; 171 } 172 173 if (schedule == kmp_sch_static) { 174 schedule = __kmp_static; 175 } else { 176 if (schedule == kmp_sch_runtime) { 177 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 178 // not specified) 179 schedule = team->t.t_sched.r_sched_type; 180 monotonicity = __kmp_get_monotonicity(schedule, use_hier); 181 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 182 // Detail the schedule if needed (global controls are differentiated 183 // appropriately) 184 if (schedule == kmp_sch_guided_chunked) { 185 schedule = __kmp_guided; 186 } else if (schedule == kmp_sch_static) { 187 schedule = __kmp_static; 188 } 189 // Use the chunk size specified by OMP_SCHEDULE (or default if not 190 // specified) 191 chunk = team->t.t_sched.chunk; 192 #if USE_ITT_BUILD 193 if (cur_chunk) 194 *cur_chunk = chunk; 195 #endif 196 #ifdef KMP_DEBUG 197 { 198 char *buff; 199 // create format specifiers before the debug output 200 buff = 
__kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 201 "schedule:%%d chunk:%%%s\n", 202 traits_t<ST>::spec); 203 KD_TRACE(10, (buff, gtid, schedule, chunk)); 204 __kmp_str_free(&buff); 205 } 206 #endif 207 } else { 208 if (schedule == kmp_sch_guided_chunked) { 209 schedule = __kmp_guided; 210 } 211 if (chunk <= 0) { 212 chunk = KMP_DEFAULT_CHUNK; 213 } 214 } 215 216 if (schedule == kmp_sch_auto) { 217 // mapping and differentiation: in the __kmp_do_serial_initialize() 218 schedule = __kmp_auto; 219 #ifdef KMP_DEBUG 220 { 221 char *buff; 222 // create format specifiers before the debug output 223 buff = __kmp_str_format( 224 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 225 "schedule:%%d chunk:%%%s\n", 226 traits_t<ST>::spec); 227 KD_TRACE(10, (buff, gtid, schedule, chunk)); 228 __kmp_str_free(&buff); 229 } 230 #endif 231 } 232 #if KMP_STATIC_STEAL_ENABLED 233 // map nonmonotonic:dynamic to static steal 234 if (schedule == kmp_sch_dynamic_chunked) { 235 if (monotonicity == SCHEDULE_NONMONOTONIC) 236 schedule = kmp_sch_static_steal; 237 } 238 #endif 239 /* guided analytical not safe for too many threads */ 240 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 241 schedule = kmp_sch_guided_iterative_chunked; 242 KMP_WARNING(DispatchManyThreads); 243 } 244 if (schedule == kmp_sch_runtime_simd) { 245 // compiler provides simd_width in the chunk parameter 246 schedule = team->t.t_sched.r_sched_type; 247 monotonicity = __kmp_get_monotonicity(schedule, use_hier); 248 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 249 // Detail the schedule if needed (global controls are differentiated 250 // appropriately) 251 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 252 schedule == __kmp_static) { 253 schedule = kmp_sch_static_balanced_chunked; 254 } else { 255 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 256 schedule = kmp_sch_guided_simd; 257 } 258 chunk = team->t.t_sched.chunk * chunk; 259 } 260 #if USE_ITT_BUILD 261 if (cur_chunk) 262 *cur_chunk = chunk; 263 #endif 264 #ifdef KMP_DEBUG 265 { 266 char *buff; 267 // create format specifiers before the debug output 268 buff = __kmp_str_format( 269 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 270 " chunk:%%%s\n", 271 traits_t<ST>::spec); 272 KD_TRACE(10, (buff, gtid, schedule, chunk)); 273 __kmp_str_free(&buff); 274 } 275 #endif 276 } 277 pr->u.p.parm1 = chunk; 278 } 279 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 280 "unknown scheduling type"); 281 282 pr->u.p.count = 0; 283 284 if (__kmp_env_consistency_check) { 285 if (st == 0) { 286 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 287 (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc); 288 } 289 } 290 // compute trip count 291 if (st == 1) { // most common case 292 if (ub >= lb) { 293 tc = ub - lb + 1; 294 } else { // ub < lb 295 tc = 0; // zero-trip 296 } 297 } else if (st < 0) { 298 if (lb >= ub) { 299 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 300 // where the division needs to be unsigned regardless of the result type 301 tc = (UT)(lb - ub) / (-st) + 1; 302 } else { // lb < ub 303 tc = 0; // zero-trip 304 } 305 } else { // st > 0 306 if (ub >= lb) { 307 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 308 // where the division needs to be unsigned regardless of the result type 309 tc = (UT)(ub - lb) / st + 1; 310 } else { // ub < lb 311 tc = 0; // zero-trip 312 } 313 } 314 315 #if KMP_STATS_ENABLED 316 if (KMP_MASTER_GTID(gtid)) { 317 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc); 318 } 319 #endif 320 321 pr->u.p.lb = lb; 322 pr->u.p.ub = ub; 323 pr->u.p.st = st; 324 pr->u.p.tc = tc; 325 326 #if KMP_OS_WINDOWS 327 pr->u.p.last_upper = ub + st; 328 #endif /* KMP_OS_WINDOWS */ 329 330 /* NOTE: only the active parallel region(s) has active ordered sections */ 331 332 if (active) { 333 if (pr->flags.ordered) { 334 pr->ordered_bumped = 0; 335 pr->u.p.ordered_lower = 1; 336 pr->u.p.ordered_upper = 0; 337 } 338 } 339 340 switch (schedule) { 341 #if (KMP_STATIC_STEAL_ENABLED) 342 case kmp_sch_static_steal: { 343 T ntc, init; 344 345 KD_TRACE(100, 346 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 347 gtid)); 348 349 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 350 if (nproc > 1 && ntc >= nproc) { 351 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 352 T id = tid; 353 T small_chunk, extras; 354 355 small_chunk = ntc / nproc; 356 extras = ntc % nproc; 357 358 init = id * small_chunk + (id < extras ? id : extras); 359 pr->u.p.count = init; 360 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 361 362 pr->u.p.parm2 = lb; 363 // parm3 is the number of times to attempt stealing which is 364 // proportional to the number of chunks per thread up until 365 // the maximum value of nproc. 366 pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc); 367 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 368 pr->u.p.st = st; 369 if (traits_t<T>::type_size > 4) { 370 // AC: TODO: check if 16-byte CAS available and use it to 371 // improve performance (probably wait for explicit request 372 // before spending time on this). 373 // For now use dynamically allocated per-thread lock, 374 // free memory in __kmp_dispatch_next when status==0. 
375 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 376 th->th.th_dispatch->th_steal_lock = 377 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 378 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 379 } 380 break; 381 } else { 382 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 383 "kmp_sch_static_balanced\n", 384 gtid)); 385 schedule = kmp_sch_static_balanced; 386 /* too few iterations: fall-through to kmp_sch_static_balanced */ 387 } // if 388 /* FALL-THROUGH to static balanced */ 389 KMP_FALLTHROUGH(); 390 } // case 391 #endif 392 case kmp_sch_static_balanced: { 393 T init, limit; 394 395 KD_TRACE( 396 100, 397 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 398 gtid)); 399 400 if (nproc > 1) { 401 T id = tid; 402 403 if (tc < nproc) { 404 if (id < tc) { 405 init = id; 406 limit = id; 407 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 408 } else { 409 pr->u.p.count = 1; /* means no more chunks to execute */ 410 pr->u.p.parm1 = FALSE; 411 break; 412 } 413 } else { 414 T small_chunk = tc / nproc; 415 T extras = tc % nproc; 416 init = id * small_chunk + (id < extras ? id : extras); 417 limit = init + small_chunk - (id < extras ? 0 : 1); 418 pr->u.p.parm1 = (id == nproc - 1); 419 } 420 } else { 421 if (tc > 0) { 422 init = 0; 423 limit = tc - 1; 424 pr->u.p.parm1 = TRUE; 425 } else { 426 // zero trip count 427 pr->u.p.count = 1; /* means no more chunks to execute */ 428 pr->u.p.parm1 = FALSE; 429 break; 430 } 431 } 432 #if USE_ITT_BUILD 433 // Calculate chunk for metadata report 434 if (itt_need_metadata_reporting) 435 if (cur_chunk) 436 *cur_chunk = limit - init + 1; 437 #endif 438 if (st == 1) { 439 pr->u.p.lb = lb + init; 440 pr->u.p.ub = lb + limit; 441 } else { 442 // calculated upper bound, "ub" is user-defined upper bound 443 T ub_tmp = lb + limit * st; 444 pr->u.p.lb = lb + init * st; 445 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 446 // it exactly 447 if (st > 0) { 448 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 449 } else { 450 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 451 } 452 } 453 if (pr->flags.ordered) { 454 pr->u.p.ordered_lower = init; 455 pr->u.p.ordered_upper = limit; 456 } 457 break; 458 } // case 459 case kmp_sch_static_balanced_chunked: { 460 // similar to balanced, but chunk adjusted to multiple of simd width 461 T nth = nproc; 462 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 463 " -> falling-through to static_greedy\n", 464 gtid)); 465 schedule = kmp_sch_static_greedy; 466 if (nth > 1) 467 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 468 else 469 pr->u.p.parm1 = tc; 470 break; 471 } // case 472 case kmp_sch_guided_simd: 473 case kmp_sch_guided_iterative_chunked: { 474 KD_TRACE( 475 100, 476 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 477 " case\n", 478 gtid)); 479 480 if (nproc > 1) { 481 if ((2L * chunk + 1) * nproc >= tc) { 482 /* chunk size too large, switch to dynamic */ 483 schedule = kmp_sch_dynamic_chunked; 484 } else { 485 // when remaining iters become less than parm2 - switch to dynamic 486 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 487 *(double *)&pr->u.p.parm3 = 488 guided_flt_param / nproc; // may occupy parm3 and parm4 489 } 490 } else { 491 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 492 "kmp_sch_static_greedy\n", 493 gtid)); 494 schedule = kmp_sch_static_greedy; 495 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 496 KD_TRACE( 497 100, 498 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 499 gtid)); 500 pr->u.p.parm1 = tc; 501 } // if 502 } // case 503 break; 504 case kmp_sch_guided_analytical_chunked: { 505 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 506 "kmp_sch_guided_analytical_chunked case\n", 507 gtid)); 508 509 if (nproc > 1) { 510 if ((2L * chunk + 1) * nproc >= tc) { 511 /* chunk size too large, switch to dynamic */ 512 schedule = kmp_sch_dynamic_chunked; 513 } else { 514 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 515 DBL x; 516 517 #if KMP_USE_X87CONTROL 518 /* Linux* OS already has 64-bit computation by default for long double, 519 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 520 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 521 instead of the default 53-bit. Even though long double doesn't work 522 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 523 expected to impact the correctness of the algorithm, but this has not 524 been mathematically proven. 
*/ 525 // save original FPCW and set precision to 64-bit, as 526 // Windows* OS on IA-32 architecture defaults to 53-bit 527 unsigned int oldFpcw = _control87(0, 0); 528 _control87(_PC_64, _MCW_PC); // 0,0x30000 529 #endif 530 /* value used for comparison in solver for cross-over point */ 531 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 532 533 /* crossover point--chunk indexes equal to or greater than 534 this point switch to dynamic-style scheduling */ 535 UT cross; 536 537 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 538 x = (long double)1.0 - (long double)0.5 / nproc; 539 540 #ifdef KMP_DEBUG 541 { // test natural alignment 542 struct _test_a { 543 char a; 544 union { 545 char b; 546 DBL d; 547 }; 548 } t; 549 ptrdiff_t natural_alignment = 550 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 551 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 552 // long)natural_alignment ); 553 KMP_DEBUG_ASSERT( 554 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 555 } 556 #endif // KMP_DEBUG 557 558 /* save the term in thread private dispatch structure */ 559 *(DBL *)&pr->u.p.parm3 = x; 560 561 /* solve for the crossover point to the nearest integer i for which C_i 562 <= chunk */ 563 { 564 UT left, right, mid; 565 long double p; 566 567 /* estimate initial upper and lower bound */ 568 569 /* doesn't matter what value right is as long as it is positive, but 570 it affects performance of the solver */ 571 right = 229; 572 p = __kmp_pow<UT>(x, right); 573 if (p > target) { 574 do { 575 p *= p; 576 right <<= 1; 577 } while (p > target && right < (1 << 27)); 578 /* lower bound is previous (failed) estimate of upper bound */ 579 left = right >> 1; 580 } else { 581 left = 0; 582 } 583 584 /* bisection root-finding method */ 585 while (left + 1 < right) { 586 mid = (left + right) / 2; 587 if (__kmp_pow<UT>(x, mid) > target) { 588 left = mid; 589 } else { 590 right = mid; 591 } 592 } // while 593 cross = right; 594 } 595 /* assert sanity of computed crossover point */ 596 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 597 __kmp_pow<UT>(x, cross) <= target); 598 599 /* save the crossover point in thread private dispatch structure */ 600 pr->u.p.parm2 = cross; 601 602 // C75803 603 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 604 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 605 #else 606 #define GUIDED_ANALYTICAL_WORKAROUND (x) 607 #endif 608 /* dynamic-style scheduling offset */ 609 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 610 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 611 cross * chunk; 612 #if KMP_USE_X87CONTROL 613 // restore FPCW 614 _control87(oldFpcw, _MCW_PC); 615 #endif 616 } // if 617 } else { 618 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 619 "kmp_sch_static_greedy\n", 620 gtid)); 621 schedule = kmp_sch_static_greedy; 622 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 623 pr->u.p.parm1 = tc; 624 } // if 625 } // case 626 break; 627 case kmp_sch_static_greedy: 628 KD_TRACE( 629 100, 630 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 631 gtid)); 632 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 633 break; 634 case kmp_sch_static_chunked: 635 case kmp_sch_dynamic_chunked: 636 if (pr->u.p.parm1 <= 0) { 637 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 638 } 639 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 640 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 641 gtid)); 642 break; 643 case kmp_sch_trapezoidal: { 644 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 645 646 T parm1, parm2, parm3, parm4; 647 KD_TRACE(100, 648 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 649 gtid)); 650 651 parm1 = chunk; 652 653 /* F : size of the first cycle */ 654 parm2 = (tc / (2 * nproc)); 655 656 if (parm2 < 1) { 657 parm2 = 1; 658 } 659 660 /* L : size of the last cycle. Make sure the last cycle is not larger 661 than the first cycle. */ 662 if (parm1 < 1) { 663 parm1 = 1; 664 } else if (parm1 > parm2) { 665 parm1 = parm2; 666 } 667 668 /* N : number of cycles */ 669 parm3 = (parm2 + parm1); 670 parm3 = (2 * tc + parm3 - 1) / parm3; 671 672 if (parm3 < 2) { 673 parm3 = 2; 674 } 675 676 /* sigma : decreasing incr of the trapezoid */ 677 parm4 = (parm3 - 1); 678 parm4 = (parm2 - parm1) / parm4; 679 680 // pointless check, because parm4 >= 0 always 681 // if ( parm4 < 0 ) { 682 // parm4 = 0; 683 //} 684 685 pr->u.p.parm1 = parm1; 686 pr->u.p.parm2 = parm2; 687 pr->u.p.parm3 = parm3; 688 pr->u.p.parm4 = parm4; 689 } // case 690 break; 691 692 default: { 693 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 694 KMP_HNT(GetNewerLibrary), // Hint 695 __kmp_msg_null // Variadic argument list terminator 696 ); 697 } break; 698 } // switch 699 pr->schedule = schedule; 700 } 701 702 #if KMP_USE_HIER_SCHED 703 template <typename T> 704 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 705 typename traits_t<T>::signed_t st); 706 template <> 707 inline void 708 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 709 kmp_int32 ub, kmp_int32 st) { 710 __kmp_dispatch_init_hierarchy<kmp_int32>( 711 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 712 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 713 } 714 template <> 715 inline void 716 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 717 kmp_uint32 ub, kmp_int32 st) { 718 __kmp_dispatch_init_hierarchy<kmp_uint32>( 719 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 720 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 721 } 722 template <> 723 inline void 724 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 725 kmp_int64 ub, kmp_int64 st) { 726 __kmp_dispatch_init_hierarchy<kmp_int64>( 727 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 728 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 729 } 730 template <> 731 inline void 732 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 733 kmp_uint64 ub, kmp_int64 st) { 734 __kmp_dispatch_init_hierarchy<kmp_uint64>( 735 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 736 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 737 } 738 739 // free all the hierarchy scheduling memory associated with the team 740 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 741 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 742 for (int i = 0; i < num_disp_buff; ++i) { 743 // type does not matter here so use kmp_int32 744 auto sh = 745 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 746 &team->t.t_disp_buffer[i]); 747 if (sh->hier) { 748 sh->hier->deallocate(); 749 __kmp_free(sh->hier); 750 } 751 } 752 } 753 #endif 754 755 // UT - unsigned flavor of T, ST - signed flavor of T, 756 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 757 template <typename T> 758 static void 759 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 760 T ub, typename traits_t<T>::signed_t st, 761 typename traits_t<T>::signed_t chunk, int push_ws) { 762 typedef typename traits_t<T>::unsigned_t UT; 763 764 int active; 765 kmp_info_t *th; 766 kmp_team_t *team; 767 kmp_uint32 my_buffer_index; 768 dispatch_private_info_template<T> *pr; 769 dispatch_shared_info_template<T> volatile *sh; 770 771 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 772 sizeof(dispatch_private_info)); 773 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 774 sizeof(dispatch_shared_info)); 775 776 if (!TCR_4(__kmp_init_parallel)) 777 __kmp_parallel_initialize(); 778 779 __kmp_resume_if_soft_paused(); 780 781 #if INCLUDE_SSC_MARKS 782 SSC_MARK_DISPATCH_INIT(); 783 #endif 784 #ifdef KMP_DEBUG 785 typedef typename traits_t<T>::signed_t ST; 786 { 787 char *buff; 788 // create format specifiers before the debug output 789 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 790 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 791 traits_t<ST>::spec, traits_t<T>::spec, 792 traits_t<T>::spec, traits_t<ST>::spec); 793 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 794 __kmp_str_free(&buff); 795 } 796 #endif 797 /* setup data */ 798 th = __kmp_threads[gtid]; 799 team = th->th.th_team; 800 active = !team->t.t_serialized; 801 th->th.th_ident = loc; 802 803 // Any half-decent optimizer will remove this test when the blocks are empty 804 // since the macros expand to nothing 805 // when statistics are disabled. 806 if (schedule == __kmp_static) { 807 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 808 } else { 809 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 810 } 811 812 #if KMP_USE_HIER_SCHED 813 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 814 // Hierarchical scheduling does not work with ordered, so if ordered is 815 // detected, then revert back to threaded scheduling. 816 bool ordered; 817 enum sched_type my_sched = schedule; 818 my_buffer_index = th->th.th_dispatch->th_disp_index; 819 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 820 &th->th.th_dispatch 821 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 822 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 823 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 824 my_sched = 825 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 826 ordered = (kmp_ord_lower & my_sched); 827 if (pr->flags.use_hier) { 828 if (ordered) { 829 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 830 "Disabling hierarchical scheduling.\n", 831 gtid)); 832 pr->flags.use_hier = FALSE; 833 } 834 } 835 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 836 // Don't use hierarchical for ordered parallel loops and don't 837 // use the runtime hierarchy if one was specified in the program 838 if (!ordered && !pr->flags.use_hier) 839 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 840 } 841 #endif // KMP_USE_HIER_SCHED 842 843 #if USE_ITT_BUILD 844 kmp_uint64 cur_chunk = chunk; 845 int itt_need_metadata_reporting = 846 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 847 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 848 team->t.t_active_level == 1; 849 #endif 850 if (!active) { 851 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 852 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 853 } else { 854 KMP_DEBUG_ASSERT(th->th.th_dispatch == 855 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 856 857 my_buffer_index = th->th.th_dispatch->th_disp_index++; 858 859 /* What happens when number of threads changes, need to resize buffer? */ 860 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 861 &th->th.th_dispatch 862 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 863 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 864 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 865 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 866 my_buffer_index)); 867 } 868 869 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 870 #if USE_ITT_BUILD 871 &cur_chunk, 872 #endif 873 chunk, (T)th->th.th_team_nproc, 874 (T)th->th.th_info.ds.ds_tid); 875 if (active) { 876 if (pr->flags.ordered == 0) { 877 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 878 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 879 } else { 880 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 881 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 882 } 883 } 884 885 if (active) { 886 /* The name of this buffer should be my_buffer_index when it's free to use 887 * it */ 888 889 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 890 "sh->buffer_index:%d\n", 891 gtid, my_buffer_index, sh->buffer_index)); 892 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 893 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 894 // Note: KMP_WAIT() cannot be used there: buffer index and 895 // my_buffer_index are *always* 32-bit integers. 896 KMP_MB(); /* is this necessary? 
*/ 897 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 898 "sh->buffer_index:%d\n", 899 gtid, my_buffer_index, sh->buffer_index)); 900 901 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 902 th->th.th_dispatch->th_dispatch_sh_current = 903 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 904 #if USE_ITT_BUILD 905 if (pr->flags.ordered) { 906 __kmp_itt_ordered_init(gtid); 907 } 908 // Report loop metadata 909 if (itt_need_metadata_reporting) { 910 // Only report metadata by master of active team at level 1 911 kmp_uint64 schedtype = 0; 912 switch (schedule) { 913 case kmp_sch_static_chunked: 914 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 915 break; 916 case kmp_sch_static_greedy: 917 cur_chunk = pr->u.p.parm1; 918 break; 919 case kmp_sch_dynamic_chunked: 920 schedtype = 1; 921 break; 922 case kmp_sch_guided_iterative_chunked: 923 case kmp_sch_guided_analytical_chunked: 924 case kmp_sch_guided_simd: 925 schedtype = 2; 926 break; 927 default: 928 // Should we put this case under "static"? 929 // case kmp_sch_static_steal: 930 schedtype = 3; 931 break; 932 } 933 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 934 } 935 #if KMP_USE_HIER_SCHED 936 if (pr->flags.use_hier) { 937 pr->u.p.count = 0; 938 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 939 } 940 #endif // KMP_USER_HIER_SCHED 941 #endif /* USE_ITT_BUILD */ 942 } 943 944 #ifdef KMP_DEBUG 945 { 946 char *buff; 947 // create format specifiers before the debug output 948 buff = __kmp_str_format( 949 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 950 "lb:%%%s ub:%%%s" 951 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 952 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 953 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 954 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 955 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 956 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 957 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 958 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 959 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 960 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 961 __kmp_str_free(&buff); 962 } 963 #endif 964 #if (KMP_STATIC_STEAL_ENABLED) 965 // It cannot be guaranteed that after execution of a loop with some other 966 // schedule kind all the parm3 variables will contain the same value. Even if 967 // all parm3 will be the same, it still exists a bad case like using 0 and 1 968 // rather than program life-time increment. So the dedicated variable is 969 // required. The 'static_steal_counter' is used. 970 if (schedule == kmp_sch_static_steal) { 971 // Other threads will inspect this variable when searching for a victim. 972 // This is a flag showing that other threads may steal from this thread 973 // since then. 
974 volatile T *p = &pr->u.p.static_steal_counter; 975 *p = *p + 1; 976 } 977 #endif // ( KMP_STATIC_STEAL_ENABLED ) 978 979 #if OMPT_SUPPORT && OMPT_OPTIONAL 980 if (ompt_enabled.ompt_callback_work) { 981 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 982 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 983 ompt_callbacks.ompt_callback(ompt_callback_work)( 984 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 985 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 986 } 987 #endif 988 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 989 } 990 991 /* For ordered loops, either __kmp_dispatch_finish() should be called after 992 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 993 * every chunk of iterations. If the ordered section(s) were not executed 994 * for this iteration (or every iteration in this chunk), we need to set the 995 * ordered iteration counters so that the next thread can proceed. */ 996 template <typename UT> 997 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 998 typedef typename traits_t<UT>::signed_t ST; 999 kmp_info_t *th = __kmp_threads[gtid]; 1000 1001 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1002 if (!th->th.th_team->t.t_serialized) { 1003 1004 dispatch_private_info_template<UT> *pr = 1005 reinterpret_cast<dispatch_private_info_template<UT> *>( 1006 th->th.th_dispatch->th_dispatch_pr_current); 1007 dispatch_shared_info_template<UT> volatile *sh = 1008 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1009 th->th.th_dispatch->th_dispatch_sh_current); 1010 KMP_DEBUG_ASSERT(pr); 1011 KMP_DEBUG_ASSERT(sh); 1012 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1013 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1014 1015 if (pr->ordered_bumped) { 1016 KD_TRACE( 1017 1000, 1018 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1019 gtid)); 1020 pr->ordered_bumped = 0; 1021 } else { 1022 UT lower = pr->u.p.ordered_lower; 1023 1024 #ifdef KMP_DEBUG 1025 { 1026 char *buff; 1027 // create format specifiers before the debug output 1028 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1029 "ordered_iteration:%%%s lower:%%%s\n", 1030 traits_t<UT>::spec, traits_t<UT>::spec); 1031 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1032 __kmp_str_free(&buff); 1033 } 1034 #endif 1035 1036 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1037 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1038 KMP_MB(); /* is this necessary? 
*/ 1039 #ifdef KMP_DEBUG 1040 { 1041 char *buff; 1042 // create format specifiers before the debug output 1043 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1044 "ordered_iteration:%%%s lower:%%%s\n", 1045 traits_t<UT>::spec, traits_t<UT>::spec); 1046 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1047 __kmp_str_free(&buff); 1048 } 1049 #endif 1050 1051 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1052 } // if 1053 } // if 1054 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1055 } 1056 1057 #ifdef KMP_GOMP_COMPAT 1058 1059 template <typename UT> 1060 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1061 typedef typename traits_t<UT>::signed_t ST; 1062 kmp_info_t *th = __kmp_threads[gtid]; 1063 1064 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1065 if (!th->th.th_team->t.t_serialized) { 1066 // int cid; 1067 dispatch_private_info_template<UT> *pr = 1068 reinterpret_cast<dispatch_private_info_template<UT> *>( 1069 th->th.th_dispatch->th_dispatch_pr_current); 1070 dispatch_shared_info_template<UT> volatile *sh = 1071 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1072 th->th.th_dispatch->th_dispatch_sh_current); 1073 KMP_DEBUG_ASSERT(pr); 1074 KMP_DEBUG_ASSERT(sh); 1075 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1076 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1077 1078 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1079 UT lower = pr->u.p.ordered_lower; 1080 UT upper = pr->u.p.ordered_upper; 1081 UT inc = upper - lower + 1; 1082 1083 if (pr->ordered_bumped == inc) { 1084 KD_TRACE( 1085 1000, 1086 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1087 gtid)); 1088 pr->ordered_bumped = 0; 1089 } else { 1090 inc -= pr->ordered_bumped; 1091 1092 #ifdef KMP_DEBUG 1093 { 1094 char *buff; 1095 // create format specifiers before the debug output 1096 buff = __kmp_str_format( 1097 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1098 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1099 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1100 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1101 __kmp_str_free(&buff); 1102 } 1103 #endif 1104 1105 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1106 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1107 1108 KMP_MB(); /* is this necessary? */ 1109 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1110 "ordered_bumped to zero\n", 1111 gtid)); 1112 pr->ordered_bumped = 0; 1113 //!!!!! TODO check if the inc should be unsigned, or signed??? 
1114 #ifdef KMP_DEBUG 1115 { 1116 char *buff; 1117 // create format specifiers before the debug output 1118 buff = __kmp_str_format( 1119 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1120 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1121 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1122 traits_t<UT>::spec); 1123 KD_TRACE(1000, 1124 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1125 __kmp_str_free(&buff); 1126 } 1127 #endif 1128 1129 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1130 } 1131 // } 1132 } 1133 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1134 } 1135 1136 #endif /* KMP_GOMP_COMPAT */ 1137 1138 template <typename T> 1139 int __kmp_dispatch_next_algorithm(int gtid, 1140 dispatch_private_info_template<T> *pr, 1141 dispatch_shared_info_template<T> volatile *sh, 1142 kmp_int32 *p_last, T *p_lb, T *p_ub, 1143 typename traits_t<T>::signed_t *p_st, T nproc, 1144 T tid) { 1145 typedef typename traits_t<T>::unsigned_t UT; 1146 typedef typename traits_t<T>::signed_t ST; 1147 typedef typename traits_t<T>::floating_t DBL; 1148 int status = 0; 1149 kmp_int32 last = 0; 1150 T start; 1151 ST incr; 1152 UT limit, trip, init; 1153 kmp_info_t *th = __kmp_threads[gtid]; 1154 kmp_team_t *team = th->th.th_team; 1155 1156 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1157 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1158 KMP_DEBUG_ASSERT(pr); 1159 KMP_DEBUG_ASSERT(sh); 1160 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1161 #ifdef KMP_DEBUG 1162 { 1163 char *buff; 1164 // create format specifiers before the debug output 1165 buff = 1166 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1167 "sh:%%p nproc:%%%s tid:%%%s\n", 1168 traits_t<T>::spec, traits_t<T>::spec); 1169 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1170 __kmp_str_free(&buff); 1171 } 1172 #endif 1173 1174 // zero trip count 1175 if (pr->u.p.tc == 0) { 1176 KD_TRACE(10, 1177 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1178 "zero status:%d\n", 1179 gtid, status)); 1180 return 0; 1181 } 1182 1183 switch (pr->schedule) { 1184 #if (KMP_STATIC_STEAL_ENABLED) 1185 case kmp_sch_static_steal: { 1186 T chunk = pr->u.p.parm1; 1187 1188 KD_TRACE(100, 1189 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1190 gtid)); 1191 1192 trip = pr->u.p.tc - 1; 1193 1194 if (traits_t<T>::type_size > 4) { 1195 // use lock for 8-byte and CAS for 4-byte induction 1196 // variable. TODO (optional): check and use 16-byte CAS 1197 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1198 KMP_DEBUG_ASSERT(lck != NULL); 1199 if (pr->u.p.count < (UT)pr->u.p.ub) { 1200 __kmp_acquire_lock(lck, gtid); 1201 // try to get own chunk of iterations 1202 init = (pr->u.p.count)++; 1203 status = (init < (UT)pr->u.p.ub); 1204 __kmp_release_lock(lck, gtid); 1205 } else { 1206 status = 0; // no own chunks 1207 } 1208 if (!status) { // try to steal 1209 kmp_info_t **other_threads = team->t.t_threads; 1210 int while_limit = pr->u.p.parm3; 1211 int while_index = 0; 1212 // TODO: algorithm of searching for a victim 1213 // should be cleaned up and measured 1214 while ((!status) && (while_limit != ++while_index)) { 1215 T remaining; 1216 T victimIdx = pr->u.p.parm4; 1217 T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; 1218 dispatch_private_info_template<T> *victim = 1219 reinterpret_cast<dispatch_private_info_template<T> *>( 1220 other_threads[victimIdx] 1221 ->th.th_dispatch->th_dispatch_pr_current); 1222 while ((victim == NULL || victim == pr || 1223 (*(volatile T *)&victim->u.p.static_steal_counter != 1224 *(volatile T *)&pr->u.p.static_steal_counter)) && 1225 oldVictimIdx != victimIdx) { 1226 victimIdx = (victimIdx + 1) % nproc; 1227 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1228 other_threads[victimIdx] 1229 ->th.th_dispatch->th_dispatch_pr_current); 1230 } 1231 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1232 *(volatile T *)&pr->u.p.static_steal_counter)) { 1233 continue; // try once more (nproc attempts in total) 1234 // no victim is ready yet to participate in stealing 1235 // because all victims are still in kmp_init_dispatch 1236 } 1237 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1238 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1239 continue; // not enough chunks to steal, goto next victim 1240 } 1241 1242 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1243 KMP_ASSERT(lck != NULL); 1244 __kmp_acquire_lock(lck, gtid); 1245 limit = victim->u.p.ub; // keep initial ub 1246 if (victim->u.p.count >= limit || 1247 (remaining = limit - victim->u.p.count) < 2) { 1248 __kmp_release_lock(lck, gtid); 1249 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1250 continue; // not enough chunks to steal 1251 } 1252 // stealing succeded, reduce victim's ub by 1/4 of undone chunks or 1253 // by 1 1254 if (remaining > 3) { 1255 // steal 1/4 of remaining 1256 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1257 init = (victim->u.p.ub -= (remaining >> 2)); 1258 } else { 1259 // steal 1 chunk of 2 or 3 remaining 1260 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1261 init = (victim->u.p.ub -= 1); 1262 } 1263 __kmp_release_lock(lck, gtid); 1264 1265 KMP_DEBUG_ASSERT(init + 1 <= limit); 1266 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1267 status = 1; 1268 while_index = 0; 1269 // now update own count and ub with stolen range but init chunk 1270 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1271 pr->u.p.count = init + 1; 1272 pr->u.p.ub = limit; 1273 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1274 } // while (search for victim) 1275 } // if (try to find victim and steal) 1276 } else { 1277 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1278 typedef union { 1279 struct { 1280 UT count; 1281 T ub; 1282 } p; 1283 kmp_int64 b; 1284 } union_i4; 1285 // All operations on 'count' or 'ub' must be combined atomically 1286 // together. 
1287 { 1288 union_i4 vold, vnew; 1289 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1290 vnew = vold; 1291 vnew.p.count++; 1292 while (!KMP_COMPARE_AND_STORE_ACQ64( 1293 (volatile kmp_int64 *)&pr->u.p.count, 1294 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1295 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1296 KMP_CPU_PAUSE(); 1297 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1298 vnew = vold; 1299 vnew.p.count++; 1300 } 1301 vnew = vold; 1302 init = vnew.p.count; 1303 status = (init < (UT)vnew.p.ub); 1304 } 1305 1306 if (!status) { 1307 kmp_info_t **other_threads = team->t.t_threads; 1308 int while_limit = pr->u.p.parm3; 1309 int while_index = 0; 1310 1311 // TODO: algorithm of searching for a victim 1312 // should be cleaned up and measured 1313 while ((!status) && (while_limit != ++while_index)) { 1314 union_i4 vold, vnew; 1315 kmp_int32 remaining; 1316 T victimIdx = pr->u.p.parm4; 1317 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1318 dispatch_private_info_template<T> *victim = 1319 reinterpret_cast<dispatch_private_info_template<T> *>( 1320 other_threads[victimIdx] 1321 ->th.th_dispatch->th_dispatch_pr_current); 1322 while ((victim == NULL || victim == pr || 1323 (*(volatile T *)&victim->u.p.static_steal_counter != 1324 *(volatile T *)&pr->u.p.static_steal_counter)) && 1325 oldVictimIdx != victimIdx) { 1326 victimIdx = (victimIdx + 1) % nproc; 1327 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1328 other_threads[victimIdx] 1329 ->th.th_dispatch->th_dispatch_pr_current); 1330 } 1331 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1332 *(volatile T *)&pr->u.p.static_steal_counter)) { 1333 continue; // try once more (nproc attempts in total) 1334 // no victim is ready yet to participate in stealing 1335 // because all victims are still in kmp_init_dispatch 1336 } 1337 pr->u.p.parm4 = victimIdx; // new victim found 1338 while (1) { // CAS loop if victim has enough chunks to steal 1339 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1340 vnew = vold; 1341 1342 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1343 if (vnew.p.count >= (UT)vnew.p.ub || 1344 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1345 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1346 break; // not enough chunks to steal, goto next victim 1347 } 1348 if (remaining > 3) { 1349 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining 1350 } else { 1351 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1352 } 1353 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1354 // TODO: Should this be acquire or release? 
1355 if (KMP_COMPARE_AND_STORE_ACQ64( 1356 (volatile kmp_int64 *)&victim->u.p.count, 1357 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1358 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1359 // stealing succedded 1360 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1361 vold.p.ub - vnew.p.ub); 1362 status = 1; 1363 while_index = 0; 1364 // now update own count and ub 1365 init = vnew.p.ub; 1366 vold.p.count = init + 1; 1367 #if KMP_ARCH_X86 1368 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1369 #else 1370 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1371 #endif 1372 break; 1373 } // if (check CAS result) 1374 KMP_CPU_PAUSE(); // CAS failed, repeate attempt 1375 } // while (try to steal from particular victim) 1376 } // while (search for victim) 1377 } // if (try to find victim and steal) 1378 } // if (4-byte induction variable) 1379 if (!status) { 1380 *p_lb = 0; 1381 *p_ub = 0; 1382 if (p_st != NULL) 1383 *p_st = 0; 1384 } else { 1385 start = pr->u.p.parm2; 1386 init *= chunk; 1387 limit = chunk + init - 1; 1388 incr = pr->u.p.st; 1389 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1390 1391 KMP_DEBUG_ASSERT(init <= trip); 1392 if ((last = (limit >= trip)) != 0) 1393 limit = trip; 1394 if (p_st != NULL) 1395 *p_st = incr; 1396 1397 if (incr == 1) { 1398 *p_lb = start + init; 1399 *p_ub = start + limit; 1400 } else { 1401 *p_lb = start + init * incr; 1402 *p_ub = start + limit * incr; 1403 } 1404 1405 if (pr->flags.ordered) { 1406 pr->u.p.ordered_lower = init; 1407 pr->u.p.ordered_upper = limit; 1408 } // if 1409 } // if 1410 break; 1411 } // case 1412 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1413 case kmp_sch_static_balanced: { 1414 KD_TRACE( 1415 10, 1416 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1417 gtid)); 1418 /* check if thread has any iteration to do */ 1419 if ((status = !pr->u.p.count) != 0) { 1420 pr->u.p.count = 1; 1421 *p_lb = pr->u.p.lb; 1422 *p_ub = pr->u.p.ub; 1423 last = pr->u.p.parm1; 1424 if (p_st != NULL) 1425 *p_st = pr->u.p.st; 1426 } else { /* no iterations to do */ 1427 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1428 } 1429 } // case 1430 break; 1431 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1432 merged here */ 1433 case kmp_sch_static_chunked: { 1434 T parm1; 1435 1436 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1437 "kmp_sch_static_[affinity|chunked] case\n", 1438 gtid)); 1439 parm1 = pr->u.p.parm1; 1440 1441 trip = pr->u.p.tc - 1; 1442 init = parm1 * (pr->u.p.count + tid); 1443 1444 if ((status = (init <= trip)) != 0) { 1445 start = pr->u.p.lb; 1446 incr = pr->u.p.st; 1447 limit = parm1 + init - 1; 1448 1449 if ((last = (limit >= trip)) != 0) 1450 limit = trip; 1451 1452 if (p_st != NULL) 1453 *p_st = incr; 1454 1455 pr->u.p.count += nproc; 1456 1457 if (incr == 1) { 1458 *p_lb = start + init; 1459 *p_ub = start + limit; 1460 } else { 1461 *p_lb = start + init * incr; 1462 *p_ub = start + limit * incr; 1463 } 1464 1465 if (pr->flags.ordered) { 1466 pr->u.p.ordered_lower = init; 1467 pr->u.p.ordered_upper = limit; 1468 } // if 1469 } // if 1470 } // case 1471 break; 1472 1473 case kmp_sch_dynamic_chunked: { 1474 T chunk = pr->u.p.parm1; 1475 1476 KD_TRACE( 1477 100, 1478 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1479 gtid)); 1480 1481 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1482 trip = pr->u.p.tc - 1; 1483 1484 if ((status = (init <= trip)) == 0) { 1485 *p_lb = 0; 1486 *p_ub = 0; 1487 if (p_st != NULL) 1488 *p_st = 
0; 1489 } else { 1490 start = pr->u.p.lb; 1491 limit = chunk + init - 1; 1492 incr = pr->u.p.st; 1493 1494 if ((last = (limit >= trip)) != 0) 1495 limit = trip; 1496 1497 if (p_st != NULL) 1498 *p_st = incr; 1499 1500 if (incr == 1) { 1501 *p_lb = start + init; 1502 *p_ub = start + limit; 1503 } else { 1504 *p_lb = start + init * incr; 1505 *p_ub = start + limit * incr; 1506 } 1507 1508 if (pr->flags.ordered) { 1509 pr->u.p.ordered_lower = init; 1510 pr->u.p.ordered_upper = limit; 1511 } // if 1512 } // if 1513 } // case 1514 break; 1515 1516 case kmp_sch_guided_iterative_chunked: { 1517 T chunkspec = pr->u.p.parm1; 1518 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1519 "iterative case\n", 1520 gtid)); 1521 trip = pr->u.p.tc; 1522 // Start atomic part of calculations 1523 while (1) { 1524 ST remaining; // signed, because can be < 0 1525 init = sh->u.s.iteration; // shared value 1526 remaining = trip - init; 1527 if (remaining <= 0) { // AC: need to compare with 0 first 1528 // nothing to do, don't try atomic op 1529 status = 0; 1530 break; 1531 } 1532 if ((T)remaining < 1533 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1534 // use dynamic-style shcedule 1535 // atomically inrement iterations, get old value 1536 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1537 (ST)chunkspec); 1538 remaining = trip - init; 1539 if (remaining <= 0) { 1540 status = 0; // all iterations got by other threads 1541 } else { 1542 // got some iterations to work on 1543 status = 1; 1544 if ((T)remaining > chunkspec) { 1545 limit = init + chunkspec - 1; 1546 } else { 1547 last = 1; // the last chunk 1548 limit = init + remaining - 1; 1549 } // if 1550 } // if 1551 break; 1552 } // if 1553 limit = init + 1554 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc 1555 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1556 (ST)init, (ST)limit)) { 1557 // CAS was successful, chunk obtained 1558 status = 1; 1559 --limit; 1560 break; 1561 } // if 1562 } // while 1563 if (status != 0) { 1564 start = pr->u.p.lb; 1565 incr = pr->u.p.st; 1566 if (p_st != NULL) 1567 *p_st = incr; 1568 *p_lb = start + init * incr; 1569 *p_ub = start + limit * incr; 1570 if (pr->flags.ordered) { 1571 pr->u.p.ordered_lower = init; 1572 pr->u.p.ordered_upper = limit; 1573 } // if 1574 } else { 1575 *p_lb = 0; 1576 *p_ub = 0; 1577 if (p_st != NULL) 1578 *p_st = 0; 1579 } // if 1580 } // case 1581 break; 1582 1583 case kmp_sch_guided_simd: { 1584 // same as iterative but curr-chunk adjusted to be multiple of given 1585 // chunk 1586 T chunk = pr->u.p.parm1; 1587 KD_TRACE(100, 1588 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1589 gtid)); 1590 trip = pr->u.p.tc; 1591 // Start atomic part of calculations 1592 while (1) { 1593 ST remaining; // signed, because can be < 0 1594 init = sh->u.s.iteration; // shared value 1595 remaining = trip - init; 1596 if (remaining <= 0) { // AC: need to compare with 0 first 1597 status = 0; // nothing to do, don't try atomic op 1598 break; 1599 } 1600 KMP_DEBUG_ASSERT(init % chunk == 0); 1601 // compare with K*nproc*(chunk+1), K=2 by default 1602 if ((T)remaining < pr->u.p.parm2) { 1603 // use dynamic-style shcedule 1604 // atomically inrement iterations, get old value 1605 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1606 (ST)chunk); 1607 remaining = trip - init; 1608 if (remaining <= 0) { 1609 status = 0; // all iterations got by other threads 1610 } else { 1611 // got 
some iterations to work on 1612 status = 1; 1613 if ((T)remaining > chunk) { 1614 limit = init + chunk - 1; 1615 } else { 1616 last = 1; // the last chunk 1617 limit = init + remaining - 1; 1618 } // if 1619 } // if 1620 break; 1621 } // if 1622 // divide by K*nproc 1623 UT span = remaining * (*(double *)&pr->u.p.parm3); 1624 UT rem = span % chunk; 1625 if (rem) // adjust so that span%chunk == 0 1626 span += chunk - rem; 1627 limit = init + span; 1628 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1629 (ST)init, (ST)limit)) { 1630 // CAS was successful, chunk obtained 1631 status = 1; 1632 --limit; 1633 break; 1634 } // if 1635 } // while 1636 if (status != 0) { 1637 start = pr->u.p.lb; 1638 incr = pr->u.p.st; 1639 if (p_st != NULL) 1640 *p_st = incr; 1641 *p_lb = start + init * incr; 1642 *p_ub = start + limit * incr; 1643 if (pr->flags.ordered) { 1644 pr->u.p.ordered_lower = init; 1645 pr->u.p.ordered_upper = limit; 1646 } // if 1647 } else { 1648 *p_lb = 0; 1649 *p_ub = 0; 1650 if (p_st != NULL) 1651 *p_st = 0; 1652 } // if 1653 } // case 1654 break; 1655 1656 case kmp_sch_guided_analytical_chunked: { 1657 T chunkspec = pr->u.p.parm1; 1658 UT chunkIdx; 1659 #if KMP_USE_X87CONTROL 1660 /* for storing original FPCW value for Windows* OS on 1661 IA-32 architecture 8-byte version */ 1662 unsigned int oldFpcw; 1663 unsigned int fpcwSet = 0; 1664 #endif 1665 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1666 "kmp_sch_guided_analytical_chunked case\n", 1667 gtid)); 1668 1669 trip = pr->u.p.tc; 1670 1671 KMP_DEBUG_ASSERT(nproc > 1); 1672 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1673 1674 while (1) { /* this while loop is a safeguard against unexpected zero 1675 chunk sizes */ 1676 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1677 if (chunkIdx >= (UT)pr->u.p.parm2) { 1678 --trip; 1679 /* use dynamic-style scheduling */ 1680 init = chunkIdx * chunkspec + pr->u.p.count; 1681 /* need to verify init > 0 in case of overflow in the above 1682 * calculation */ 1683 if ((status = (init > 0 && init <= trip)) != 0) { 1684 limit = init + chunkspec - 1; 1685 1686 if ((last = (limit >= trip)) != 0) 1687 limit = trip; 1688 } 1689 break; 1690 } else { 1691 /* use exponential-style scheduling */ 1692 /* The following check is to workaround the lack of long double precision on 1693 Windows* OS. 1694 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1695 */ 1696 #if KMP_USE_X87CONTROL 1697 /* If we haven't already done so, save original 1698 FPCW and set precision to 64-bit, as Windows* OS 1699 on IA-32 architecture defaults to 53-bit */ 1700 if (!fpcwSet) { 1701 oldFpcw = _control87(0, 0); 1702 _control87(_PC_64, _MCW_PC); 1703 fpcwSet = 0x30000; 1704 } 1705 #endif 1706 if (chunkIdx) { 1707 init = __kmp_dispatch_guided_remaining<T>( 1708 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1709 KMP_DEBUG_ASSERT(init); 1710 init = trip - init; 1711 } else 1712 init = 0; 1713 limit = trip - __kmp_dispatch_guided_remaining<T>( 1714 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1715 KMP_ASSERT(init <= limit); 1716 if (init < limit) { 1717 KMP_DEBUG_ASSERT(limit <= trip); 1718 --limit; 1719 status = 1; 1720 break; 1721 } // if 1722 } // if 1723 } // while (1) 1724 #if KMP_USE_X87CONTROL 1725 /* restore FPCW if necessary 1726 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1727 */ 1728 if (fpcwSet && (oldFpcw & fpcwSet)) 1729 _control87(oldFpcw, _MCW_PC); 1730 #endif 1731 if (status != 0) { 1732 start = pr->u.p.lb; 1733 incr = pr->u.p.st; 1734 if (p_st != NULL) 1735 *p_st = incr; 1736 *p_lb = start + init * incr; 1737 *p_ub = start + limit * incr; 1738 if (pr->flags.ordered) { 1739 pr->u.p.ordered_lower = init; 1740 pr->u.p.ordered_upper = limit; 1741 } 1742 } else { 1743 *p_lb = 0; 1744 *p_ub = 0; 1745 if (p_st != NULL) 1746 *p_st = 0; 1747 } 1748 } // case 1749 break; 1750 1751 case kmp_sch_trapezoidal: { 1752 UT index; 1753 T parm2 = pr->u.p.parm2; 1754 T parm3 = pr->u.p.parm3; 1755 T parm4 = pr->u.p.parm4; 1756 KD_TRACE(100, 1757 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1758 gtid)); 1759 1760 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1761 1762 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1763 trip = pr->u.p.tc - 1; 1764 1765 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1766 *p_lb = 0; 1767 *p_ub = 0; 1768 if (p_st != NULL) 1769 *p_st = 0; 1770 } else { 1771 start = pr->u.p.lb; 1772 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1773 incr = pr->u.p.st; 1774 1775 if ((last = (limit >= trip)) != 0) 1776 limit = trip; 1777 1778 if (p_st != NULL) 1779 *p_st = incr; 1780 1781 if (incr == 1) { 1782 *p_lb = start + init; 1783 *p_ub = start + limit; 1784 } else { 1785 *p_lb = start + init * incr; 1786 *p_ub = start + limit * incr; 1787 } 1788 1789 if (pr->flags.ordered) { 1790 pr->u.p.ordered_lower = init; 1791 pr->u.p.ordered_upper = limit; 1792 } // if 1793 } // if 1794 } // case 1795 break; 1796 default: { 1797 status = 0; // to avoid complaints on uninitialized variable use 1798 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1799 KMP_HNT(GetNewerLibrary), // Hint 1800 __kmp_msg_null // Variadic argument list terminator 1801 ); 1802 } break; 1803 } // switch 1804 if (p_last) 1805 *p_last = last; 1806 #ifdef KMP_DEBUG 1807 if (pr->flags.ordered) { 1808 char *buff; 1809 // create format specifiers before the debug output 1810 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1811 "ordered_lower:%%%s ordered_upper:%%%s\n", 1812 traits_t<UT>::spec, traits_t<UT>::spec); 1813 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1814 __kmp_str_free(&buff); 1815 } 1816 { 1817 char *buff; 1818 // create format specifiers before the debug output 1819 buff = __kmp_str_format( 1820 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1821 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1822 
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END \
  if (status == 0) { \
    if (ompt_enabled.ompt_callback_work) { \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
      ompt_callbacks.ompt_callback(ompt_callback_work)( \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
          &(task_info->task_data), 0, codeptr); \
    } \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END \
  { \
    kmp_int64 u, l, t, i; \
    l = (kmp_int64)(*p_lb); \
    u = (kmp_int64)(*p_ub); \
    i = (kmp_int64)(pr->u.p.st); \
    if (status == 0) { \
      t = 0; \
      KMP_POP_PARTITIONED_TIMER(); \
    } else if (i == 1) { \
      if (u >= l) \
        t = u - l + 1; \
      else \
        t = 0; \
    } else if (i < 0) { \
      if (l >= u) \
        t = (l - u) / (-i) + 1; \
      else \
        t = 0; \
    } else { \
      if (u >= l) \
        t = (u - l) / i + 1; \
      else \
        t = 0; \
    } \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual run-time schedule is static. (This points out a
  // disadvantage of schedule(runtime): even when static scheduling is actually
  // used, it costs more than a compile-time choice of static scheduling
  // would.)
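  // Illustrative sketch (not part of the library source): a user loop written
  // as
  //
  //   #pragma omp parallel for schedule(runtime)
  //   for (int i = 0; i < n; ++i)
  //     body(i);
  //
  // is lowered onto this dynamic dispatch path, so even when OMP_SCHEDULE
  // selects a static schedule the thread still pays one __kmpc_dispatch_next_*
  // call per chunk, whereas schedule(static) written in the source is resolved
  // at compile time without any per-chunk runtime call.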
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      // if ( p_last != NULL )
      //   *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        // if ( p_last != NULL )
        //   *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
KMP_STATS_LOOP_END; 2022 return status; 2023 } else { 2024 kmp_int32 last = 0; 2025 dispatch_shared_info_template<T> volatile *sh; 2026 2027 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2028 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2029 2030 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2031 th->th.th_dispatch->th_dispatch_pr_current); 2032 KMP_DEBUG_ASSERT(pr); 2033 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2034 th->th.th_dispatch->th_dispatch_sh_current); 2035 KMP_DEBUG_ASSERT(sh); 2036 2037 #if KMP_USE_HIER_SCHED 2038 if (pr->flags.use_hier) 2039 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2040 else 2041 #endif // KMP_USE_HIER_SCHED 2042 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2043 p_st, th->th.th_team_nproc, 2044 th->th.th_info.ds.ds_tid); 2045 // status == 0: no more iterations to execute 2046 if (status == 0) { 2047 UT num_done; 2048 2049 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2050 #ifdef KMP_DEBUG 2051 { 2052 char *buff; 2053 // create format specifiers before the debug output 2054 buff = __kmp_str_format( 2055 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2056 traits_t<UT>::spec); 2057 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2058 __kmp_str_free(&buff); 2059 } 2060 #endif 2061 2062 #if KMP_USE_HIER_SCHED 2063 pr->flags.use_hier = FALSE; 2064 #endif 2065 if ((ST)num_done == th->th.th_team_nproc - 1) { 2066 #if (KMP_STATIC_STEAL_ENABLED) 2067 if (pr->schedule == kmp_sch_static_steal && 2068 traits_t<T>::type_size > 4) { 2069 int i; 2070 kmp_info_t **other_threads = team->t.t_threads; 2071 // loop complete, safe to destroy locks used for stealing 2072 for (i = 0; i < th->th.th_team_nproc; ++i) { 2073 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2074 KMP_ASSERT(lck != NULL); 2075 __kmp_destroy_lock(lck); 2076 __kmp_free(lck); 2077 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2078 } 2079 } 2080 #endif 2081 /* NOTE: release this buffer to be reused */ 2082 2083 KMP_MB(); /* Flush all pending memory write invalidates. */ 2084 2085 sh->u.s.num_done = 0; 2086 sh->u.s.iteration = 0; 2087 2088 /* TODO replace with general release procedure? */ 2089 if (pr->flags.ordered) { 2090 sh->u.s.ordered_iteration = 0; 2091 } 2092 2093 KMP_MB(); /* Flush all pending memory write invalidates. */ 2094 2095 sh->buffer_index += __kmp_dispatch_num_buffers; 2096 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2097 gtid, sh->buffer_index)); 2098 2099 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2100 2101 } // if 2102 if (__kmp_env_consistency_check) { 2103 if (pr->pushed_ws != ct_none) { 2104 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2105 } 2106 } 2107 2108 th->th.th_dispatch->th_deo_fcn = NULL; 2109 th->th.th_dispatch->th_dxo_fcn = NULL; 2110 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2111 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2112 } // if (status == 0) 2113 #if KMP_OS_WINDOWS 2114 else if (last) { 2115 pr->u.p.last_upper = pr->u.p.ub; 2116 } 2117 #endif /* KMP_OS_WINDOWS */ 2118 if (p_last != NULL && status != 0) 2119 *p_last = last; 2120 } // if 2121 2122 #ifdef KMP_DEBUG 2123 { 2124 char *buff; 2125 // create format specifiers before the debug output 2126 buff = __kmp_str_format( 2127 "__kmp_dispatch_next: T#%%d normal case: " 2128 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2129 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2130 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2131 (p_last ? *p_last : 0), status)); 2132 __kmp_str_free(&buff); 2133 } 2134 #endif 2135 #if INCLUDE_SSC_MARKS 2136 SSC_MARK_DISPATCH_NEXT(); 2137 #endif 2138 OMPT_LOOP_END; 2139 KMP_STATS_LOOP_END; 2140 return status; 2141 } 2142 2143 template <typename T> 2144 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2145 kmp_int32 *plastiter, T *plower, T *pupper, 2146 typename traits_t<T>::signed_t incr) { 2147 typedef typename traits_t<T>::unsigned_t UT; 2148 kmp_uint32 team_id; 2149 kmp_uint32 nteams; 2150 UT trip_count; 2151 kmp_team_t *team; 2152 kmp_info_t *th; 2153 2154 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2155 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2156 #ifdef KMP_DEBUG 2157 typedef typename traits_t<T>::signed_t ST; 2158 { 2159 char *buff; 2160 // create format specifiers before the debug output 2161 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2162 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2163 traits_t<T>::spec, traits_t<T>::spec, 2164 traits_t<ST>::spec, traits_t<T>::spec); 2165 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2166 __kmp_str_free(&buff); 2167 } 2168 #endif 2169 2170 if (__kmp_env_consistency_check) { 2171 if (incr == 0) { 2172 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2173 loc); 2174 } 2175 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2176 // The loop is illegal. 
2177 // Some zero-trip loops maintained by compiler, e.g.: 2178 // for(i=10;i<0;++i) // lower >= upper - run-time check 2179 // for(i=0;i>10;--i) // lower <= upper - run-time check 2180 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2181 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2182 // Compiler does not check the following illegal loops: 2183 // for(i=0;i<10;i+=incr) // where incr<0 2184 // for(i=10;i>0;i-=incr) // where incr<0 2185 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2186 } 2187 } 2188 th = __kmp_threads[gtid]; 2189 team = th->th.th_team; 2190 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2191 nteams = th->th.th_teams_size.nteams; 2192 team_id = team->t.t_master_tid; 2193 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2194 2195 // compute global trip count 2196 if (incr == 1) { 2197 trip_count = *pupper - *plower + 1; 2198 } else if (incr == -1) { 2199 trip_count = *plower - *pupper + 1; 2200 } else if (incr > 0) { 2201 // upper-lower can exceed the limit of signed type 2202 trip_count = (UT)(*pupper - *plower) / incr + 1; 2203 } else { 2204 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2205 } 2206 2207 if (trip_count <= nteams) { 2208 KMP_DEBUG_ASSERT( 2209 __kmp_static == kmp_sch_static_greedy || 2210 __kmp_static == 2211 kmp_sch_static_balanced); // Unknown static scheduling type. 2212 // only some teams get single iteration, others get nothing 2213 if (team_id < trip_count) { 2214 *pupper = *plower = *plower + team_id * incr; 2215 } else { 2216 *plower = *pupper + incr; // zero-trip loop 2217 } 2218 if (plastiter != NULL) 2219 *plastiter = (team_id == trip_count - 1); 2220 } else { 2221 if (__kmp_static == kmp_sch_static_balanced) { 2222 UT chunk = trip_count / nteams; 2223 UT extras = trip_count % nteams; 2224 *plower += 2225 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2226 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2227 if (plastiter != NULL) 2228 *plastiter = (team_id == nteams - 1); 2229 } else { 2230 T chunk_inc_count = 2231 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2232 T upper = *pupper; 2233 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2234 // Unknown static scheduling type. 2235 *plower += team_id * chunk_inc_count; 2236 *pupper = *plower + chunk_inc_count - incr; 2237 // Check/correct bounds if needed 2238 if (incr > 0) { 2239 if (*pupper < *plower) 2240 *pupper = traits_t<T>::max_value; 2241 if (plastiter != NULL) 2242 *plastiter = *plower <= upper && *pupper > upper - incr; 2243 if (*pupper > upper) 2244 *pupper = upper; // tracker C73258 2245 } else { 2246 if (*pupper > *plower) 2247 *pupper = traits_t<T>::min_value; 2248 if (plastiter != NULL) 2249 *plastiter = *plower >= upper && *pupper < upper - incr; 2250 if (*pupper < upper) 2251 *pupper = upper; // tracker C73258 2252 } 2253 } 2254 } 2255 } 2256 2257 //----------------------------------------------------------------------------- 2258 // Dispatch routines 2259 // Transfer call to template< type T > 2260 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2261 // T lb, T ub, ST st, ST chunk ) 2262 extern "C" { 2263 2264 /*! 
2265 @ingroup WORK_SHARING 2266 @{ 2267 @param loc Source location 2268 @param gtid Global thread id 2269 @param schedule Schedule type 2270 @param lb Lower bound 2271 @param ub Upper bound 2272 @param st Step (or increment if you prefer) 2273 @param chunk The chunk size to block with 2274 2275 This function prepares the runtime to start a dynamically scheduled for loop, 2276 saving the loop arguments. 2277 These functions are all identical apart from the types of the arguments. 2278 */ 2279 2280 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2281 enum sched_type schedule, kmp_int32 lb, 2282 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2283 KMP_DEBUG_ASSERT(__kmp_init_serial); 2284 #if OMPT_SUPPORT && OMPT_OPTIONAL 2285 OMPT_STORE_RETURN_ADDRESS(gtid); 2286 #endif 2287 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2288 } 2289 /*! 2290 See @ref __kmpc_dispatch_init_4 2291 */ 2292 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2293 enum sched_type schedule, kmp_uint32 lb, 2294 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2295 KMP_DEBUG_ASSERT(__kmp_init_serial); 2296 #if OMPT_SUPPORT && OMPT_OPTIONAL 2297 OMPT_STORE_RETURN_ADDRESS(gtid); 2298 #endif 2299 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2300 } 2301 2302 /*! 2303 See @ref __kmpc_dispatch_init_4 2304 */ 2305 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2306 enum sched_type schedule, kmp_int64 lb, 2307 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2308 KMP_DEBUG_ASSERT(__kmp_init_serial); 2309 #if OMPT_SUPPORT && OMPT_OPTIONAL 2310 OMPT_STORE_RETURN_ADDRESS(gtid); 2311 #endif 2312 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2313 } 2314 2315 /*! 2316 See @ref __kmpc_dispatch_init_4 2317 */ 2318 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2319 enum sched_type schedule, kmp_uint64 lb, 2320 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2321 KMP_DEBUG_ASSERT(__kmp_init_serial); 2322 #if OMPT_SUPPORT && OMPT_OPTIONAL 2323 OMPT_STORE_RETURN_ADDRESS(gtid); 2324 #endif 2325 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2326 } 2327 2328 /*! 2329 See @ref __kmpc_dispatch_init_4 2330 2331 Difference from __kmpc_dispatch_init set of functions is these functions 2332 are called for composite distribute parallel for construct. Thus before 2333 regular iterations dispatching we need to calc per-team iteration space. 2334 2335 These functions are all identical apart from the types of the arguments. 
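
As an illustrative sketch (not literal compiler output), a composite

    #pragma omp distribute parallel for schedule(dynamic, 8)

first narrows the global iteration space to the calling team's sub-range (via
__kmp_dist_get_bounds) and then performs the ordinary dispatch initialization,
so a 32-bit signed instantiation behaves roughly like

    kmp_int32 last; // hypothetical local holding the last-chunk flag
    __kmpc_dist_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, &last,
                                lb, ub, st, 8);

followed by the usual __kmpc_dispatch_next_4 chunk loop (see the sketch at the
end of this file).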
2336 */ 2337 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2338 enum sched_type schedule, kmp_int32 *p_last, 2339 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2340 kmp_int32 chunk) { 2341 KMP_DEBUG_ASSERT(__kmp_init_serial); 2342 #if OMPT_SUPPORT && OMPT_OPTIONAL 2343 OMPT_STORE_RETURN_ADDRESS(gtid); 2344 #endif 2345 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2346 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2347 } 2348 2349 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2350 enum sched_type schedule, kmp_int32 *p_last, 2351 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2352 kmp_int32 chunk) { 2353 KMP_DEBUG_ASSERT(__kmp_init_serial); 2354 #if OMPT_SUPPORT && OMPT_OPTIONAL 2355 OMPT_STORE_RETURN_ADDRESS(gtid); 2356 #endif 2357 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2358 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2359 } 2360 2361 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2362 enum sched_type schedule, kmp_int32 *p_last, 2363 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2364 kmp_int64 chunk) { 2365 KMP_DEBUG_ASSERT(__kmp_init_serial); 2366 #if OMPT_SUPPORT && OMPT_OPTIONAL 2367 OMPT_STORE_RETURN_ADDRESS(gtid); 2368 #endif 2369 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2370 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2371 } 2372 2373 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2374 enum sched_type schedule, kmp_int32 *p_last, 2375 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2376 kmp_int64 chunk) { 2377 KMP_DEBUG_ASSERT(__kmp_init_serial); 2378 #if OMPT_SUPPORT && OMPT_OPTIONAL 2379 OMPT_STORE_RETURN_ADDRESS(gtid); 2380 #endif 2381 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2382 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2383 } 2384 2385 /*! 2386 @param loc Source code location 2387 @param gtid Global thread id 2388 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2389 otherwise 2390 @param p_lb Pointer to the lower bound for the next chunk of work 2391 @param p_ub Pointer to the upper bound for the next chunk of work 2392 @param p_st Pointer to the stride for the next chunk of work 2393 @return one if there is work to be done, zero otherwise 2394 2395 Get the next dynamically allocated chunk of work for this thread. 2396 If there is no more work, then the lb,ub and stride need not be modified. 2397 */ 2398 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2399 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2400 #if OMPT_SUPPORT && OMPT_OPTIONAL 2401 OMPT_STORE_RETURN_ADDRESS(gtid); 2402 #endif 2403 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2404 #if OMPT_SUPPORT && OMPT_OPTIONAL 2405 , 2406 OMPT_LOAD_RETURN_ADDRESS(gtid) 2407 #endif 2408 ); 2409 } 2410 2411 /*! 2412 See @ref __kmpc_dispatch_next_4 2413 */ 2414 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2415 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2416 kmp_int32 *p_st) { 2417 #if OMPT_SUPPORT && OMPT_OPTIONAL 2418 OMPT_STORE_RETURN_ADDRESS(gtid); 2419 #endif 2420 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2421 #if OMPT_SUPPORT && OMPT_OPTIONAL 2422 , 2423 OMPT_LOAD_RETURN_ADDRESS(gtid) 2424 #endif 2425 ); 2426 } 2427 2428 /*! 
2429 See @ref __kmpc_dispatch_next_4 2430 */ 2431 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2432 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2433 #if OMPT_SUPPORT && OMPT_OPTIONAL 2434 OMPT_STORE_RETURN_ADDRESS(gtid); 2435 #endif 2436 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2437 #if OMPT_SUPPORT && OMPT_OPTIONAL 2438 , 2439 OMPT_LOAD_RETURN_ADDRESS(gtid) 2440 #endif 2441 ); 2442 } 2443 2444 /*! 2445 See @ref __kmpc_dispatch_next_4 2446 */ 2447 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2448 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2449 kmp_int64 *p_st) { 2450 #if OMPT_SUPPORT && OMPT_OPTIONAL 2451 OMPT_STORE_RETURN_ADDRESS(gtid); 2452 #endif 2453 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2454 #if OMPT_SUPPORT && OMPT_OPTIONAL 2455 , 2456 OMPT_LOAD_RETURN_ADDRESS(gtid) 2457 #endif 2458 ); 2459 } 2460 2461 /*! 2462 @param loc Source code location 2463 @param gtid Global thread id 2464 2465 Mark the end of a dynamic loop. 2466 */ 2467 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2468 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2469 } 2470 2471 /*! 2472 See @ref __kmpc_dispatch_fini_4 2473 */ 2474 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2475 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2476 } 2477 2478 /*! 2479 See @ref __kmpc_dispatch_fini_4 2480 */ 2481 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2482 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2483 } 2484 2485 /*! 2486 See @ref __kmpc_dispatch_fini_4 2487 */ 2488 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2489 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2490 } 2491 /*! @} */ 2492 2493 //----------------------------------------------------------------------------- 2494 // Non-template routines from kmp_dispatch.cpp used in other sources 2495 2496 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2497 return value == checker; 2498 } 2499 2500 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2501 return value != checker; 2502 } 2503 2504 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2505 return value < checker; 2506 } 2507 2508 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2509 return value >= checker; 2510 } 2511 2512 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2513 return value <= checker; 2514 } 2515 2516 kmp_uint32 2517 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2518 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2519 void *obj // Higher-level synchronization object, or NULL. 2520 ) { 2521 // note: we may not belong to a team at this point 2522 volatile kmp_uint32 *spin = spinner; 2523 kmp_uint32 check = checker; 2524 kmp_uint32 spins; 2525 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2526 kmp_uint32 r; 2527 2528 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2529 KMP_INIT_YIELD(spins); 2530 // main wait spin loop 2531 while (!f(r = TCR_4(*spin), check)) { 2532 KMP_FSYNC_SPIN_PREPARE(obj); 2533 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2534 split. 
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
                      ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
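
/* Illustrative usage sketch (not part of the library; the local variable
   names are hypothetical): a dynamically scheduled worksharing loop such as

       #pragma omp for schedule(dynamic, 4)
       for (int i = 0; i < n; ++i)
         body(i);

   drives the dispatch entry points above roughly as follows in each thread:

       kmp_int32 last, lb, ub, st;
       __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1,
                              4);
       while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
         for (kmp_int32 i = lb; i <= ub; i += st)
           body(i);
       }

   __kmpc_dispatch_next_4 returns zero once all chunks have been handed out;
   __kmpc_dispatch_fini_4 is typically emitted only for ordered loops. */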