/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  int monotonicity;
  // default to monotonic
  monotonicity = SCHEDULE_MONOTONIC;
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;
  return monotonicity;
}

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule,chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1. This is often just the thread id within a team, but
// is not necessarily the case when using hierarchical scheduling.
93 // loc is the source file location of the corresponding loop 94 // gtid is the global thread id 95 template <typename T> 96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 97 dispatch_private_info_template<T> *pr, 98 enum sched_type schedule, T lb, T ub, 99 typename traits_t<T>::signed_t st, 100 #if USE_ITT_BUILD 101 kmp_uint64 *cur_chunk, 102 #endif 103 typename traits_t<T>::signed_t chunk, 104 T nproc, T tid) { 105 typedef typename traits_t<T>::unsigned_t UT; 106 typedef typename traits_t<T>::floating_t DBL; 107 108 int active; 109 T tc; 110 kmp_info_t *th; 111 kmp_team_t *team; 112 int monotonicity; 113 bool use_hier; 114 115 #ifdef KMP_DEBUG 116 typedef typename traits_t<T>::signed_t ST; 117 { 118 char *buff; 119 // create format specifiers before the debug output 120 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 121 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 122 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 123 traits_t<T>::spec, traits_t<T>::spec, 124 traits_t<ST>::spec, traits_t<ST>::spec, 125 traits_t<T>::spec, traits_t<T>::spec); 126 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 127 __kmp_str_free(&buff); 128 } 129 #endif 130 /* setup data */ 131 th = __kmp_threads[gtid]; 132 team = th->th.th_team; 133 active = !team->t.t_serialized; 134 135 #if USE_ITT_BUILD 136 int itt_need_metadata_reporting = 137 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 138 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 139 team->t.t_active_level == 1; 140 #endif 141 142 #if KMP_USE_HIER_SCHED 143 use_hier = pr->flags.use_hier; 144 #else 145 use_hier = false; 146 #endif 147 148 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 149 monotonicity = __kmp_get_monotonicity(schedule, use_hier); 150 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 151 152 /* Pick up the nomerge/ordered bits from the scheduling type */ 153 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 154 pr->flags.nomerge = TRUE; 155 schedule = 156 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 157 } else { 158 pr->flags.nomerge = FALSE; 159 } 160 pr->type_size = traits_t<T>::type_size; // remember the size of variables 161 if (kmp_ord_lower & schedule) { 162 pr->flags.ordered = TRUE; 163 schedule = 164 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 165 } else { 166 pr->flags.ordered = FALSE; 167 } 168 // Ordered overrides nonmonotonic 169 if (pr->flags.ordered) { 170 monotonicity = SCHEDULE_MONOTONIC; 171 } 172 173 if (schedule == kmp_sch_static) { 174 schedule = __kmp_static; 175 } else { 176 if (schedule == kmp_sch_runtime) { 177 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 178 // not specified) 179 schedule = team->t.t_sched.r_sched_type; 180 monotonicity = __kmp_get_monotonicity(schedule, use_hier); 181 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 182 // Detail the schedule if needed (global controls are differentiated 183 // appropriately) 184 if (schedule == kmp_sch_guided_chunked) { 185 schedule = __kmp_guided; 186 } else if (schedule == kmp_sch_static) { 187 schedule = __kmp_static; 188 } 189 // Use the chunk size specified by OMP_SCHEDULE (or default if not 190 // specified) 191 chunk = team->t.t_sched.chunk; 192 #if USE_ITT_BUILD 193 if (cur_chunk) 194 *cur_chunk = chunk; 195 #endif 196 #ifdef KMP_DEBUG 197 { 198 char *buff; 199 // create format specifiers before the debug output 200 buff = 
__kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 201 "schedule:%%d chunk:%%%s\n", 202 traits_t<ST>::spec); 203 KD_TRACE(10, (buff, gtid, schedule, chunk)); 204 __kmp_str_free(&buff); 205 } 206 #endif 207 } else { 208 if (schedule == kmp_sch_guided_chunked) { 209 schedule = __kmp_guided; 210 } 211 if (chunk <= 0) { 212 chunk = KMP_DEFAULT_CHUNK; 213 } 214 } 215 216 if (schedule == kmp_sch_auto) { 217 // mapping and differentiation: in the __kmp_do_serial_initialize() 218 schedule = __kmp_auto; 219 #ifdef KMP_DEBUG 220 { 221 char *buff; 222 // create format specifiers before the debug output 223 buff = __kmp_str_format( 224 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 225 "schedule:%%d chunk:%%%s\n", 226 traits_t<ST>::spec); 227 KD_TRACE(10, (buff, gtid, schedule, chunk)); 228 __kmp_str_free(&buff); 229 } 230 #endif 231 } 232 #if KMP_STATIC_STEAL_ENABLED 233 // map nonmonotonic:dynamic to static steal 234 if (schedule == kmp_sch_dynamic_chunked) { 235 if (monotonicity == SCHEDULE_NONMONOTONIC) 236 schedule = kmp_sch_static_steal; 237 } 238 #endif 239 /* guided analytical not safe for too many threads */ 240 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 241 schedule = kmp_sch_guided_iterative_chunked; 242 KMP_WARNING(DispatchManyThreads); 243 } 244 if (schedule == kmp_sch_runtime_simd) { 245 // compiler provides simd_width in the chunk parameter 246 schedule = team->t.t_sched.r_sched_type; 247 monotonicity = __kmp_get_monotonicity(schedule, use_hier); 248 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 249 // Detail the schedule if needed (global controls are differentiated 250 // appropriately) 251 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 252 schedule == __kmp_static) { 253 schedule = kmp_sch_static_balanced_chunked; 254 } else { 255 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 256 schedule = kmp_sch_guided_simd; 257 } 258 chunk = team->t.t_sched.chunk * chunk; 259 } 260 #if USE_ITT_BUILD 261 if (cur_chunk) 262 *cur_chunk = chunk; 263 #endif 264 #ifdef KMP_DEBUG 265 { 266 char *buff; 267 // create format specifiers before the debug output 268 buff = __kmp_str_format( 269 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 270 " chunk:%%%s\n", 271 traits_t<ST>::spec); 272 KD_TRACE(10, (buff, gtid, schedule, chunk)); 273 __kmp_str_free(&buff); 274 } 275 #endif 276 } 277 pr->u.p.parm1 = chunk; 278 } 279 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 280 "unknown scheduling type"); 281 282 pr->u.p.count = 0; 283 284 if (__kmp_env_consistency_check) { 285 if (st == 0) { 286 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 287 (pr->flags.ordered ? 
                                ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing which is
      // proportional to the number of chunks per thread up until
      // the maximum value of nproc.
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
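        // Illustrative example for the initial distribution above (editorial
        // sketch, hypothetical numbers not taken from any particular run):
        // the ntc chunk indices are split evenly, with the first 'extras'
        // threads getting one extra chunk.  E.g. with tc = 100, chunk = 10,
        // nproc = 4:
        //   ntc = 10, small_chunk = 2, extras = 2
        //   T0 owns chunk indices [0, 3), T1 [3, 6), T2 [6, 8), T3 [8, 10)
        // so pr->u.p.count / pr->u.p.ub hold the half-open range of chunk
        // indices a thread works through before it starts stealing.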
375 KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); 376 pr->u.p.th_steal_lock = 377 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 378 __kmp_init_lock(pr->u.p.th_steal_lock); 379 } 380 break; 381 } else { 382 /* too few chunks: switching to kmp_sch_dynamic_chunked */ 383 schedule = kmp_sch_dynamic_chunked; 384 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " 385 "kmp_sch_dynamic_chunked\n", 386 gtid)); 387 if (pr->u.p.parm1 <= 0) 388 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 389 break; 390 } // if 391 } // case 392 #endif 393 case kmp_sch_static_balanced: { 394 T init, limit; 395 396 KD_TRACE( 397 100, 398 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 399 gtid)); 400 401 if (nproc > 1) { 402 T id = tid; 403 404 if (tc < nproc) { 405 if (id < tc) { 406 init = id; 407 limit = id; 408 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 409 } else { 410 pr->u.p.count = 1; /* means no more chunks to execute */ 411 pr->u.p.parm1 = FALSE; 412 break; 413 } 414 } else { 415 T small_chunk = tc / nproc; 416 T extras = tc % nproc; 417 init = id * small_chunk + (id < extras ? id : extras); 418 limit = init + small_chunk - (id < extras ? 0 : 1); 419 pr->u.p.parm1 = (id == nproc - 1); 420 } 421 } else { 422 if (tc > 0) { 423 init = 0; 424 limit = tc - 1; 425 pr->u.p.parm1 = TRUE; 426 } else { 427 // zero trip count 428 pr->u.p.count = 1; /* means no more chunks to execute */ 429 pr->u.p.parm1 = FALSE; 430 break; 431 } 432 } 433 #if USE_ITT_BUILD 434 // Calculate chunk for metadata report 435 if (itt_need_metadata_reporting) 436 if (cur_chunk) 437 *cur_chunk = limit - init + 1; 438 #endif 439 if (st == 1) { 440 pr->u.p.lb = lb + init; 441 pr->u.p.ub = lb + limit; 442 } else { 443 // calculated upper bound, "ub" is user-defined upper bound 444 T ub_tmp = lb + limit * st; 445 pr->u.p.lb = lb + init * st; 446 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 447 // it exactly 448 if (st > 0) { 449 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 450 } else { 451 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 452 } 453 } 454 if (pr->flags.ordered) { 455 pr->u.p.ordered_lower = init; 456 pr->u.p.ordered_upper = limit; 457 } 458 break; 459 } // case 460 case kmp_sch_static_balanced_chunked: { 461 // similar to balanced, but chunk adjusted to multiple of simd width 462 T nth = nproc; 463 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 464 " -> falling-through to static_greedy\n", 465 gtid)); 466 schedule = kmp_sch_static_greedy; 467 if (nth > 1) 468 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 469 else 470 pr->u.p.parm1 = tc; 471 break; 472 } // case 473 case kmp_sch_guided_simd: 474 case kmp_sch_guided_iterative_chunked: { 475 KD_TRACE( 476 100, 477 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 478 " case\n", 479 gtid)); 480 481 if (nproc > 1) { 482 if ((2L * chunk + 1) * nproc >= tc) { 483 /* chunk size too large, switch to dynamic */ 484 schedule = kmp_sch_dynamic_chunked; 485 } else { 486 // when remaining iters become less than parm2 - switch to dynamic 487 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 488 *(double *)&pr->u.p.parm3 = 489 guided_flt_param / nproc; // may occupy parm3 and parm4 490 } 491 } else { 492 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 493 "kmp_sch_static_greedy\n", 494 gtid)); 495 schedule = kmp_sch_static_greedy; 496 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 497 KD_TRACE( 498 100, 499 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 500 gtid)); 501 pr->u.p.parm1 = tc; 502 } // if 503 } // case 504 break; 505 case kmp_sch_guided_analytical_chunked: { 506 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 507 "kmp_sch_guided_analytical_chunked case\n", 508 gtid)); 509 510 if (nproc > 1) { 511 if ((2L * chunk + 1) * nproc >= tc) { 512 /* chunk size too large, switch to dynamic */ 513 schedule = kmp_sch_dynamic_chunked; 514 } else { 515 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 516 DBL x; 517 518 #if KMP_USE_X87CONTROL 519 /* Linux* OS already has 64-bit computation by default for long double, 520 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 521 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 522 instead of the default 53-bit. Even though long double doesn't work 523 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 524 expected to impact the correctness of the algorithm, but this has not 525 been mathematically proven. 
*/ 526 // save original FPCW and set precision to 64-bit, as 527 // Windows* OS on IA-32 architecture defaults to 53-bit 528 unsigned int oldFpcw = _control87(0, 0); 529 _control87(_PC_64, _MCW_PC); // 0,0x30000 530 #endif 531 /* value used for comparison in solver for cross-over point */ 532 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 533 534 /* crossover point--chunk indexes equal to or greater than 535 this point switch to dynamic-style scheduling */ 536 UT cross; 537 538 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 539 x = (long double)1.0 - (long double)0.5 / nproc; 540 541 #ifdef KMP_DEBUG 542 { // test natural alignment 543 struct _test_a { 544 char a; 545 union { 546 char b; 547 DBL d; 548 }; 549 } t; 550 ptrdiff_t natural_alignment = 551 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 552 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 553 // long)natural_alignment ); 554 KMP_DEBUG_ASSERT( 555 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 556 } 557 #endif // KMP_DEBUG 558 559 /* save the term in thread private dispatch structure */ 560 *(DBL *)&pr->u.p.parm3 = x; 561 562 /* solve for the crossover point to the nearest integer i for which C_i 563 <= chunk */ 564 { 565 UT left, right, mid; 566 long double p; 567 568 /* estimate initial upper and lower bound */ 569 570 /* doesn't matter what value right is as long as it is positive, but 571 it affects performance of the solver */ 572 right = 229; 573 p = __kmp_pow<UT>(x, right); 574 if (p > target) { 575 do { 576 p *= p; 577 right <<= 1; 578 } while (p > target && right < (1 << 27)); 579 /* lower bound is previous (failed) estimate of upper bound */ 580 left = right >> 1; 581 } else { 582 left = 0; 583 } 584 585 /* bisection root-finding method */ 586 while (left + 1 < right) { 587 mid = (left + right) / 2; 588 if (__kmp_pow<UT>(x, mid) > target) { 589 left = mid; 590 } else { 591 right = mid; 592 } 593 } // while 594 cross = right; 595 } 596 /* assert sanity of computed crossover point */ 597 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 598 __kmp_pow<UT>(x, cross) <= target); 599 600 /* save the crossover point in thread private dispatch structure */ 601 pr->u.p.parm2 = cross; 602 603 // C75803 604 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 605 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 606 #else 607 #define GUIDED_ANALYTICAL_WORKAROUND (x) 608 #endif 609 /* dynamic-style scheduling offset */ 610 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 611 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 612 cross * chunk; 613 #if KMP_USE_X87CONTROL 614 // restore FPCW 615 _control87(oldFpcw, _MCW_PC); 616 #endif 617 } // if 618 } else { 619 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 620 "kmp_sch_static_greedy\n", 621 gtid)); 622 schedule = kmp_sch_static_greedy; 623 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 624 pr->u.p.parm1 = tc; 625 } // if 626 } // case 627 break; 628 case kmp_sch_static_greedy: 629 KD_TRACE( 630 100, 631 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 632 gtid)); 633 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 634 break; 635 case kmp_sch_static_chunked: 636 case kmp_sch_dynamic_chunked: 637 if (pr->u.p.parm1 <= 0) { 638 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 639 } 640 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 641 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 642 gtid)); 643 break; 644 case kmp_sch_trapezoidal: { 645 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 646 647 T parm1, parm2, parm3, parm4; 648 KD_TRACE(100, 649 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 650 gtid)); 651 652 parm1 = chunk; 653 654 /* F : size of the first cycle */ 655 parm2 = (tc / (2 * nproc)); 656 657 if (parm2 < 1) { 658 parm2 = 1; 659 } 660 661 /* L : size of the last cycle. Make sure the last cycle is not larger 662 than the first cycle. */ 663 if (parm1 < 1) { 664 parm1 = 1; 665 } else if (parm1 > parm2) { 666 parm1 = parm2; 667 } 668 669 /* N : number of cycles */ 670 parm3 = (parm2 + parm1); 671 parm3 = (2 * tc + parm3 - 1) / parm3; 672 673 if (parm3 < 2) { 674 parm3 = 2; 675 } 676 677 /* sigma : decreasing incr of the trapezoid */ 678 parm4 = (parm3 - 1); 679 parm4 = (parm2 - parm1) / parm4; 680 681 // pointless check, because parm4 >= 0 always 682 // if ( parm4 < 0 ) { 683 // parm4 = 0; 684 //} 685 686 pr->u.p.parm1 = parm1; 687 pr->u.p.parm2 = parm2; 688 pr->u.p.parm3 = parm3; 689 pr->u.p.parm4 = parm4; 690 } // case 691 break; 692 693 default: { 694 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 695 KMP_HNT(GetNewerLibrary), // Hint 696 __kmp_msg_null // Variadic argument list terminator 697 ); 698 } break; 699 } // switch 700 pr->schedule = schedule; 701 } 702 703 #if KMP_USE_HIER_SCHED 704 template <typename T> 705 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 706 typename traits_t<T>::signed_t st); 707 template <> 708 inline void 709 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 710 kmp_int32 ub, kmp_int32 st) { 711 __kmp_dispatch_init_hierarchy<kmp_int32>( 712 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 713 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 714 } 715 template <> 716 inline void 717 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 718 kmp_uint32 ub, kmp_int32 st) { 719 __kmp_dispatch_init_hierarchy<kmp_uint32>( 720 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 721 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 722 } 723 template <> 724 inline void 725 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 726 kmp_int64 ub, kmp_int64 st) { 727 __kmp_dispatch_init_hierarchy<kmp_int64>( 728 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 729 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 730 } 731 template <> 732 inline void 733 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 734 kmp_uint64 ub, kmp_int64 st) { 735 __kmp_dispatch_init_hierarchy<kmp_uint64>( 736 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 737 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 738 } 739 740 // free all the hierarchy scheduling memory associated with the team 741 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 742 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 743 for (int i = 0; i < num_disp_buff; ++i) { 744 // type does not matter here so use kmp_int32 745 auto sh = 746 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 747 &team->t.t_disp_buffer[i]); 748 if (sh->hier) { 749 sh->hier->deallocate(); 750 __kmp_free(sh->hier); 751 } 752 } 753 } 754 #endif 755 756 // UT - unsigned flavor of T, ST - signed flavor of T, 757 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 758 template <typename T> 759 static void 760 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 761 T ub, typename traits_t<T>::signed_t st, 762 typename traits_t<T>::signed_t chunk, int push_ws) { 763 typedef typename traits_t<T>::unsigned_t UT; 764 765 int active; 766 kmp_info_t *th; 767 kmp_team_t *team; 768 kmp_uint32 my_buffer_index; 769 dispatch_private_info_template<T> *pr; 770 dispatch_shared_info_template<T> volatile *sh; 771 772 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 773 sizeof(dispatch_private_info)); 774 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 775 sizeof(dispatch_shared_info)); 776 777 if (!TCR_4(__kmp_init_parallel)) 778 __kmp_parallel_initialize(); 779 780 __kmp_resume_if_soft_paused(); 781 782 #if INCLUDE_SSC_MARKS 783 SSC_MARK_DISPATCH_INIT(); 784 #endif 785 #ifdef KMP_DEBUG 786 typedef typename traits_t<T>::signed_t ST; 787 { 788 char *buff; 789 // create format specifiers before the debug output 790 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 791 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 792 traits_t<ST>::spec, traits_t<T>::spec, 793 traits_t<T>::spec, traits_t<ST>::spec); 794 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 795 __kmp_str_free(&buff); 796 } 797 #endif 798 /* setup data */ 799 th = __kmp_threads[gtid]; 800 team = th->th.th_team; 801 active = !team->t.t_serialized; 802 th->th.th_ident = loc; 803 804 // Any half-decent optimizer will remove this test when the blocks are empty 805 // since the macros expand to nothing 806 // when statistics are disabled. 807 if (schedule == __kmp_static) { 808 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 809 } else { 810 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 811 } 812 813 #if KMP_USE_HIER_SCHED 814 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 815 // Hierarchical scheduling does not work with ordered, so if ordered is 816 // detected, then revert back to threaded scheduling. 817 bool ordered; 818 enum sched_type my_sched = schedule; 819 my_buffer_index = th->th.th_dispatch->th_disp_index; 820 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 821 &th->th.th_dispatch 822 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 823 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 824 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 825 my_sched = 826 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 827 ordered = (kmp_ord_lower & my_sched); 828 if (pr->flags.use_hier) { 829 if (ordered) { 830 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 831 "Disabling hierarchical scheduling.\n", 832 gtid)); 833 pr->flags.use_hier = FALSE; 834 } 835 } 836 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 837 // Don't use hierarchical for ordered parallel loops and don't 838 // use the runtime hierarchy if one was specified in the program 839 if (!ordered && !pr->flags.use_hier) 840 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 841 } 842 #endif // KMP_USE_HIER_SCHED 843 844 #if USE_ITT_BUILD 845 kmp_uint64 cur_chunk = chunk; 846 int itt_need_metadata_reporting = 847 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 848 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 849 team->t.t_active_level == 1; 850 #endif 851 if (!active) { 852 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 853 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 854 } else { 855 KMP_DEBUG_ASSERT(th->th.th_dispatch == 856 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 857 858 my_buffer_index = th->th.th_dispatch->th_disp_index++; 859 860 /* What happens when number of threads changes, need to resize buffer? */ 861 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 862 &th->th.th_dispatch 863 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 864 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 865 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 866 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 867 my_buffer_index)); 868 } 869 870 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 871 #if USE_ITT_BUILD 872 &cur_chunk, 873 #endif 874 chunk, (T)th->th.th_team_nproc, 875 (T)th->th.th_info.ds.ds_tid); 876 if (active) { 877 if (pr->flags.ordered == 0) { 878 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 879 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 880 } else { 881 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 882 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 883 } 884 } 885 886 if (active) { 887 /* The name of this buffer should be my_buffer_index when it's free to use 888 * it */ 889 890 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 891 "sh->buffer_index:%d\n", 892 gtid, my_buffer_index, sh->buffer_index)); 893 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 894 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 895 // Note: KMP_WAIT() cannot be used there: buffer index and 896 // my_buffer_index are *always* 32-bit integers. 897 KMP_MB(); /* is this necessary? 
*/
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, a bad case could still arise, e.g. reusing
  // 0 and 1 alternately rather than a program life-time increment. So a
  // dedicated variable is required: 'static_steal_counter' is used.
  if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // since then.
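    // Illustrative example (hypothetical numbers): if this is the third
    // static_steal loop this thread has run, the increment below makes
    // static_steal_counter == 3.  A prospective thief in
    // __kmp_dispatch_next_algorithm() only steals from a victim whose
    // static_steal_counter matches its own loop id, so buffers left over
    // from earlier loops are never mistaken for the current one.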
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
      //!!!!! TODO check if the inc should be unsigned, or signed???
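      // Illustrative example (hypothetical numbers): if this chunk covers
      // ordered iterations [lower, upper] = [8, 11] then inc starts at 4; if
      // the thread already executed one ordered section (ordered_bumped == 1)
      // only the remaining 3 are added to sh->u.s.ordered_iteration below, so
      // the next thread's wait on its own lower bound can still succeed.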
1115 #ifdef KMP_DEBUG 1116 { 1117 char *buff; 1118 // create format specifiers before the debug output 1119 buff = __kmp_str_format( 1120 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1121 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1122 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1123 traits_t<UT>::spec); 1124 KD_TRACE(1000, 1125 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1126 __kmp_str_free(&buff); 1127 } 1128 #endif 1129 1130 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1131 } 1132 // } 1133 } 1134 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1135 } 1136 1137 #endif /* KMP_GOMP_COMPAT */ 1138 1139 template <typename T> 1140 int __kmp_dispatch_next_algorithm(int gtid, 1141 dispatch_private_info_template<T> *pr, 1142 dispatch_shared_info_template<T> volatile *sh, 1143 kmp_int32 *p_last, T *p_lb, T *p_ub, 1144 typename traits_t<T>::signed_t *p_st, T nproc, 1145 T tid) { 1146 typedef typename traits_t<T>::unsigned_t UT; 1147 typedef typename traits_t<T>::signed_t ST; 1148 typedef typename traits_t<T>::floating_t DBL; 1149 int status = 0; 1150 kmp_int32 last = 0; 1151 T start; 1152 ST incr; 1153 UT limit, trip, init; 1154 kmp_info_t *th = __kmp_threads[gtid]; 1155 kmp_team_t *team = th->th.th_team; 1156 1157 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1158 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1159 KMP_DEBUG_ASSERT(pr); 1160 KMP_DEBUG_ASSERT(sh); 1161 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1162 #ifdef KMP_DEBUG 1163 { 1164 char *buff; 1165 // create format specifiers before the debug output 1166 buff = 1167 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1168 "sh:%%p nproc:%%%s tid:%%%s\n", 1169 traits_t<T>::spec, traits_t<T>::spec); 1170 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1171 __kmp_str_free(&buff); 1172 } 1173 #endif 1174 1175 // zero trip count 1176 if (pr->u.p.tc == 0) { 1177 KD_TRACE(10, 1178 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1179 "zero status:%d\n", 1180 gtid, status)); 1181 return 0; 1182 } 1183 1184 switch (pr->schedule) { 1185 #if (KMP_STATIC_STEAL_ENABLED) 1186 case kmp_sch_static_steal: { 1187 T chunk = pr->u.p.parm1; 1188 1189 KD_TRACE(100, 1190 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1191 gtid)); 1192 1193 trip = pr->u.p.tc - 1; 1194 1195 if (traits_t<T>::type_size > 4) { 1196 // use lock for 8-byte and CAS for 4-byte induction 1197 // variable. 
TODO (optional): check and use 16-byte CAS 1198 kmp_lock_t *lck = pr->u.p.th_steal_lock; 1199 KMP_DEBUG_ASSERT(lck != NULL); 1200 if (pr->u.p.count < (UT)pr->u.p.ub) { 1201 __kmp_acquire_lock(lck, gtid); 1202 // try to get own chunk of iterations 1203 init = (pr->u.p.count)++; 1204 status = (init < (UT)pr->u.p.ub); 1205 __kmp_release_lock(lck, gtid); 1206 } else { 1207 status = 0; // no own chunks 1208 } 1209 if (!status) { // try to steal 1210 kmp_info_t **other_threads = team->t.t_threads; 1211 int while_limit = pr->u.p.parm3; 1212 int while_index = 0; 1213 T id = pr->u.p.static_steal_counter; // loop id 1214 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1215 __kmp_dispatch_num_buffers; // current loop index 1216 // note: victim thread can potentially execute another loop 1217 // TODO: algorithm of searching for a victim 1218 // should be cleaned up and measured 1219 while ((!status) && (while_limit != ++while_index)) { 1220 dispatch_private_info_template<T> *victim; 1221 T remaining; 1222 T victimIdx = pr->u.p.parm4; 1223 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1224 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1225 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1226 KMP_DEBUG_ASSERT(victim); 1227 while ((victim == pr || id != victim->u.p.static_steal_counter) && 1228 oldVictimIdx != victimIdx) { 1229 victimIdx = (victimIdx + 1) % nproc; 1230 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1231 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1232 KMP_DEBUG_ASSERT(victim); 1233 } 1234 if (victim == pr || id != victim->u.p.static_steal_counter) { 1235 continue; // try once more (nproc attempts in total) 1236 // no victim is ready yet to participate in stealing 1237 // because no victim passed kmp_init_dispatch yet 1238 } 1239 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1240 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1241 continue; // not enough chunks to steal, goto next victim 1242 } 1243 1244 lck = victim->u.p.th_steal_lock; 1245 KMP_ASSERT(lck != NULL); 1246 __kmp_acquire_lock(lck, gtid); 1247 limit = victim->u.p.ub; // keep initial ub 1248 if (victim->u.p.count >= limit || 1249 (remaining = limit - victim->u.p.count) < 2) { 1250 __kmp_release_lock(lck, gtid); 1251 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1252 continue; // not enough chunks to steal 1253 } 1254 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or 1255 // by 1 1256 if (remaining > 3) { 1257 // steal 1/4 of remaining 1258 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1259 init = (victim->u.p.ub -= (remaining >> 2)); 1260 } else { 1261 // steal 1 chunk of 2 or 3 remaining 1262 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1263 init = (victim->u.p.ub -= 1); 1264 } 1265 __kmp_release_lock(lck, gtid); 1266 1267 KMP_DEBUG_ASSERT(init + 1 <= limit); 1268 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1269 status = 1; 1270 while_index = 0; 1271 // now update own count and ub with stolen range but init chunk 1272 __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); 1273 pr->u.p.count = init + 1; 1274 pr->u.p.ub = limit; 1275 __kmp_release_lock(pr->u.p.th_steal_lock, gtid); 1276 } // while (search for victim) 1277 } // if (try to find victim and steal) 1278 } else { 1279 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1280 typedef union { 1281 struct { 1282 UT count; 1283 T ub; 1284 } p; 1285 kmp_int64 b; 1286 } union_i4; 1287 // 
      // All operations on 'count' or 'ub' must be combined atomically
      // together.
      {
        union_i4 vold, vnew;
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
        while (!KMP_COMPARE_AND_STORE_ACQ64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew = vold;
          vnew.p.count++;
        }
        vnew = vold;
        init = vnew.p.count;
        status = (init < (UT)vnew.p.ub);
      }

      if (!status) {
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because no victim passed kmp_init_dispatch yet
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, goto next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            // TODO: Should this be acquire or release?
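            // Illustrative example (hypothetical numbers): if the victim's
            // packed (count, ub) is (3, 11) then remaining == 8, so the thief
            // proposes (3, 9) via the CAS below; on success the victim keeps
            // chunk indices [3, 9) while the thief takes [9, 11), starting on
            // chunk 9 immediately and leaving chunk 10 in its own buffer.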
1357 if (KMP_COMPARE_AND_STORE_ACQ64( 1358 (volatile kmp_int64 *)&victim->u.p.count, 1359 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1360 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1361 // stealing succeeded 1362 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1363 vold.p.ub - vnew.p.ub); 1364 status = 1; 1365 while_index = 0; 1366 // now update own count and ub 1367 init = vnew.p.ub; 1368 vold.p.count = init + 1; 1369 #if KMP_ARCH_X86 1370 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1371 #else 1372 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1373 #endif 1374 break; 1375 } // if (check CAS result) 1376 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt 1377 } // while (try to steal from particular victim) 1378 } // while (search for victim) 1379 } // if (try to find victim and steal) 1380 } // if (4-byte induction variable) 1381 if (!status) { 1382 *p_lb = 0; 1383 *p_ub = 0; 1384 if (p_st != NULL) 1385 *p_st = 0; 1386 } else { 1387 start = pr->u.p.parm2; 1388 init *= chunk; 1389 limit = chunk + init - 1; 1390 incr = pr->u.p.st; 1391 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1392 1393 KMP_DEBUG_ASSERT(init <= trip); 1394 if ((last = (limit >= trip)) != 0) 1395 limit = trip; 1396 if (p_st != NULL) 1397 *p_st = incr; 1398 1399 if (incr == 1) { 1400 *p_lb = start + init; 1401 *p_ub = start + limit; 1402 } else { 1403 *p_lb = start + init * incr; 1404 *p_ub = start + limit * incr; 1405 } 1406 1407 if (pr->flags.ordered) { 1408 pr->u.p.ordered_lower = init; 1409 pr->u.p.ordered_upper = limit; 1410 } // if 1411 } // if 1412 break; 1413 } // case 1414 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1415 case kmp_sch_static_balanced: { 1416 KD_TRACE( 1417 10, 1418 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1419 gtid)); 1420 /* check if thread has any iteration to do */ 1421 if ((status = !pr->u.p.count) != 0) { 1422 pr->u.p.count = 1; 1423 *p_lb = pr->u.p.lb; 1424 *p_ub = pr->u.p.ub; 1425 last = pr->u.p.parm1; 1426 if (p_st != NULL) 1427 *p_st = pr->u.p.st; 1428 } else { /* no iterations to do */ 1429 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1430 } 1431 } // case 1432 break; 1433 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1434 merged here */ 1435 case kmp_sch_static_chunked: { 1436 T parm1; 1437 1438 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1439 "kmp_sch_static_[affinity|chunked] case\n", 1440 gtid)); 1441 parm1 = pr->u.p.parm1; 1442 1443 trip = pr->u.p.tc - 1; 1444 init = parm1 * (pr->u.p.count + tid); 1445 1446 if ((status = (init <= trip)) != 0) { 1447 start = pr->u.p.lb; 1448 incr = pr->u.p.st; 1449 limit = parm1 + init - 1; 1450 1451 if ((last = (limit >= trip)) != 0) 1452 limit = trip; 1453 1454 if (p_st != NULL) 1455 *p_st = incr; 1456 1457 pr->u.p.count += nproc; 1458 1459 if (incr == 1) { 1460 *p_lb = start + init; 1461 *p_ub = start + limit; 1462 } else { 1463 *p_lb = start + init * incr; 1464 *p_ub = start + limit * incr; 1465 } 1466 1467 if (pr->flags.ordered) { 1468 pr->u.p.ordered_lower = init; 1469 pr->u.p.ordered_upper = limit; 1470 } // if 1471 } // if 1472 } // case 1473 break; 1474 1475 case kmp_sch_dynamic_chunked: { 1476 T chunk = pr->u.p.parm1; 1477 1478 KD_TRACE( 1479 100, 1480 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1481 gtid)); 1482 1483 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1484 trip = pr->u.p.tc - 1; 1485 1486 if ((status = (init <= trip)) == 0) { 1487 *p_lb = 0; 1488 *p_ub = 0; 1489 if (p_st != NULL) 1490 
*p_st = 0; 1491 } else { 1492 start = pr->u.p.lb; 1493 limit = chunk + init - 1; 1494 incr = pr->u.p.st; 1495 1496 if ((last = (limit >= trip)) != 0) 1497 limit = trip; 1498 1499 if (p_st != NULL) 1500 *p_st = incr; 1501 1502 if (incr == 1) { 1503 *p_lb = start + init; 1504 *p_ub = start + limit; 1505 } else { 1506 *p_lb = start + init * incr; 1507 *p_ub = start + limit * incr; 1508 } 1509 1510 if (pr->flags.ordered) { 1511 pr->u.p.ordered_lower = init; 1512 pr->u.p.ordered_upper = limit; 1513 } // if 1514 } // if 1515 } // case 1516 break; 1517 1518 case kmp_sch_guided_iterative_chunked: { 1519 T chunkspec = pr->u.p.parm1; 1520 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1521 "iterative case\n", 1522 gtid)); 1523 trip = pr->u.p.tc; 1524 // Start atomic part of calculations 1525 while (1) { 1526 ST remaining; // signed, because can be < 0 1527 init = sh->u.s.iteration; // shared value 1528 remaining = trip - init; 1529 if (remaining <= 0) { // AC: need to compare with 0 first 1530 // nothing to do, don't try atomic op 1531 status = 0; 1532 break; 1533 } 1534 if ((T)remaining < 1535 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1536 // use dynamic-style schedule 1537 // atomically increment iterations, get old value 1538 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1539 (ST)chunkspec); 1540 remaining = trip - init; 1541 if (remaining <= 0) { 1542 status = 0; // all iterations got by other threads 1543 } else { 1544 // got some iterations to work on 1545 status = 1; 1546 if ((T)remaining > chunkspec) { 1547 limit = init + chunkspec - 1; 1548 } else { 1549 last = 1; // the last chunk 1550 limit = init + remaining - 1; 1551 } // if 1552 } // if 1553 break; 1554 } // if 1555 limit = init + 1556 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc 1557 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1558 (ST)init, (ST)limit)) { 1559 // CAS was successful, chunk obtained 1560 status = 1; 1561 --limit; 1562 break; 1563 } // if 1564 } // while 1565 if (status != 0) { 1566 start = pr->u.p.lb; 1567 incr = pr->u.p.st; 1568 if (p_st != NULL) 1569 *p_st = incr; 1570 *p_lb = start + init * incr; 1571 *p_ub = start + limit * incr; 1572 if (pr->flags.ordered) { 1573 pr->u.p.ordered_lower = init; 1574 pr->u.p.ordered_upper = limit; 1575 } // if 1576 } else { 1577 *p_lb = 0; 1578 *p_ub = 0; 1579 if (p_st != NULL) 1580 *p_st = 0; 1581 } // if 1582 } // case 1583 break; 1584 1585 case kmp_sch_guided_simd: { 1586 // same as iterative but curr-chunk adjusted to be multiple of given 1587 // chunk 1588 T chunk = pr->u.p.parm1; 1589 KD_TRACE(100, 1590 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1591 gtid)); 1592 trip = pr->u.p.tc; 1593 // Start atomic part of calculations 1594 while (1) { 1595 ST remaining; // signed, because can be < 0 1596 init = sh->u.s.iteration; // shared value 1597 remaining = trip - init; 1598 if (remaining <= 0) { // AC: need to compare with 0 first 1599 status = 0; // nothing to do, don't try atomic op 1600 break; 1601 } 1602 KMP_DEBUG_ASSERT(init % chunk == 0); 1603 // compare with K*nproc*(chunk+1), K=2 by default 1604 if ((T)remaining < pr->u.p.parm2) { 1605 // use dynamic-style schedule 1606 // atomically increment iterations, get old value 1607 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1608 (ST)chunk); 1609 remaining = trip - init; 1610 if (remaining <= 0) { 1611 status = 0; // all iterations got by other threads 1612 } else { 1613 
// got some iterations to work on 1614 status = 1; 1615 if ((T)remaining > chunk) { 1616 limit = init + chunk - 1; 1617 } else { 1618 last = 1; // the last chunk 1619 limit = init + remaining - 1; 1620 } // if 1621 } // if 1622 break; 1623 } // if 1624 // divide by K*nproc 1625 UT span = remaining * (*(double *)&pr->u.p.parm3); 1626 UT rem = span % chunk; 1627 if (rem) // adjust so that span%chunk == 0 1628 span += chunk - rem; 1629 limit = init + span; 1630 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1631 (ST)init, (ST)limit)) { 1632 // CAS was successful, chunk obtained 1633 status = 1; 1634 --limit; 1635 break; 1636 } // if 1637 } // while 1638 if (status != 0) { 1639 start = pr->u.p.lb; 1640 incr = pr->u.p.st; 1641 if (p_st != NULL) 1642 *p_st = incr; 1643 *p_lb = start + init * incr; 1644 *p_ub = start + limit * incr; 1645 if (pr->flags.ordered) { 1646 pr->u.p.ordered_lower = init; 1647 pr->u.p.ordered_upper = limit; 1648 } // if 1649 } else { 1650 *p_lb = 0; 1651 *p_ub = 0; 1652 if (p_st != NULL) 1653 *p_st = 0; 1654 } // if 1655 } // case 1656 break; 1657 1658 case kmp_sch_guided_analytical_chunked: { 1659 T chunkspec = pr->u.p.parm1; 1660 UT chunkIdx; 1661 #if KMP_USE_X87CONTROL 1662 /* for storing original FPCW value for Windows* OS on 1663 IA-32 architecture 8-byte version */ 1664 unsigned int oldFpcw; 1665 unsigned int fpcwSet = 0; 1666 #endif 1667 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1668 "kmp_sch_guided_analytical_chunked case\n", 1669 gtid)); 1670 1671 trip = pr->u.p.tc; 1672 1673 KMP_DEBUG_ASSERT(nproc > 1); 1674 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1675 1676 while (1) { /* this while loop is a safeguard against unexpected zero 1677 chunk sizes */ 1678 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1679 if (chunkIdx >= (UT)pr->u.p.parm2) { 1680 --trip; 1681 /* use dynamic-style scheduling */ 1682 init = chunkIdx * chunkspec + pr->u.p.count; 1683 /* need to verify init > 0 in case of overflow in the above 1684 * calculation */ 1685 if ((status = (init > 0 && init <= trip)) != 0) { 1686 limit = init + chunkspec - 1; 1687 1688 if ((last = (limit >= trip)) != 0) 1689 limit = trip; 1690 } 1691 break; 1692 } else { 1693 /* use exponential-style scheduling */ 1694 /* The following check is to workaround the lack of long double precision on 1695 Windows* OS. 1696 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1697 */ 1698 #if KMP_USE_X87CONTROL 1699 /* If we haven't already done so, save original 1700 FPCW and set precision to 64-bit, as Windows* OS 1701 on IA-32 architecture defaults to 53-bit */ 1702 if (!fpcwSet) { 1703 oldFpcw = _control87(0, 0); 1704 _control87(_PC_64, _MCW_PC); 1705 fpcwSet = 0x30000; 1706 } 1707 #endif 1708 if (chunkIdx) { 1709 init = __kmp_dispatch_guided_remaining<T>( 1710 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1711 KMP_DEBUG_ASSERT(init); 1712 init = trip - init; 1713 } else 1714 init = 0; 1715 limit = trip - __kmp_dispatch_guided_remaining<T>( 1716 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1717 KMP_ASSERT(init <= limit); 1718 if (init < limit) { 1719 KMP_DEBUG_ASSERT(limit <= trip); 1720 --limit; 1721 status = 1; 1722 break; 1723 } // if 1724 } // if 1725 } // while (1) 1726 #if KMP_USE_X87CONTROL 1727 /* restore FPCW if necessary 1728 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1729 */ 1730 if (fpcwSet && (oldFpcw & fpcwSet)) 1731 _control87(oldFpcw, _MCW_PC); 1732 #endif 1733 if (status != 0) { 1734 start = pr->u.p.lb; 1735 incr = pr->u.p.st; 1736 if (p_st != NULL) 1737 *p_st = incr; 1738 *p_lb = start + init * incr; 1739 *p_ub = start + limit * incr; 1740 if (pr->flags.ordered) { 1741 pr->u.p.ordered_lower = init; 1742 pr->u.p.ordered_upper = limit; 1743 } 1744 } else { 1745 *p_lb = 0; 1746 *p_ub = 0; 1747 if (p_st != NULL) 1748 *p_st = 0; 1749 } 1750 } // case 1751 break; 1752 1753 case kmp_sch_trapezoidal: { 1754 UT index; 1755 T parm2 = pr->u.p.parm2; 1756 T parm3 = pr->u.p.parm3; 1757 T parm4 = pr->u.p.parm4; 1758 KD_TRACE(100, 1759 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1760 gtid)); 1761 1762 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1763 1764 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1765 trip = pr->u.p.tc - 1; 1766 1767 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1768 *p_lb = 0; 1769 *p_ub = 0; 1770 if (p_st != NULL) 1771 *p_st = 0; 1772 } else { 1773 start = pr->u.p.lb; 1774 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1775 incr = pr->u.p.st; 1776 1777 if ((last = (limit >= trip)) != 0) 1778 limit = trip; 1779 1780 if (p_st != NULL) 1781 *p_st = incr; 1782 1783 if (incr == 1) { 1784 *p_lb = start + init; 1785 *p_ub = start + limit; 1786 } else { 1787 *p_lb = start + init * incr; 1788 *p_ub = start + limit * incr; 1789 } 1790 1791 if (pr->flags.ordered) { 1792 pr->u.p.ordered_lower = init; 1793 pr->u.p.ordered_upper = limit; 1794 } // if 1795 } // if 1796 } // case 1797 break; 1798 default: { 1799 status = 0; // to avoid complaints on uninitialized variable use 1800 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1801 KMP_HNT(GetNewerLibrary), // Hint 1802 __kmp_msg_null // Variadic argument list terminator 1803 ); 1804 } break; 1805 } // switch 1806 if (p_last) 1807 *p_last = last; 1808 #ifdef KMP_DEBUG 1809 if (pr->flags.ordered) { 1810 char *buff; 1811 // create format specifiers before the debug output 1812 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1813 "ordered_lower:%%%s ordered_upper:%%%s\n", 1814 traits_t<UT>::spec, traits_t<UT>::spec); 1815 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1816 __kmp_str_free(&buff); 1817 } 1818 { 1819 char *buff; 1820 // create format specifiers before the debug output 1821 buff = __kmp_str_format( 1822 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1823 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1824 
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1825 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1826 __kmp_str_free(&buff); 1827 } 1828 #endif 1829 return status; 1830 } 1831 1832 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1833 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1834 is not called. */ 1835 #if OMPT_SUPPORT && OMPT_OPTIONAL 1836 #define OMPT_LOOP_END \ 1837 if (status == 0) { \ 1838 if (ompt_enabled.ompt_callback_work) { \ 1839 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1840 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1841 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1842 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1843 &(task_info->task_data), 0, codeptr); \ 1844 } \ 1845 } 1846 // TODO: implement count 1847 #else 1848 #define OMPT_LOOP_END // no-op 1849 #endif 1850 1851 #if KMP_STATS_ENABLED 1852 #define KMP_STATS_LOOP_END \ 1853 { \ 1854 kmp_int64 u, l, t, i; \ 1855 l = (kmp_int64)(*p_lb); \ 1856 u = (kmp_int64)(*p_ub); \ 1857 i = (kmp_int64)(pr->u.p.st); \ 1858 if (status == 0) { \ 1859 t = 0; \ 1860 KMP_POP_PARTITIONED_TIMER(); \ 1861 } else if (i == 1) { \ 1862 if (u >= l) \ 1863 t = u - l + 1; \ 1864 else \ 1865 t = 0; \ 1866 } else if (i < 0) { \ 1867 if (l >= u) \ 1868 t = (l - u) / (-i) + 1; \ 1869 else \ 1870 t = 0; \ 1871 } else { \ 1872 if (u >= l) \ 1873 t = (u - l) / i + 1; \ 1874 else \ 1875 t = 0; \ 1876 } \ 1877 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1878 } 1879 #else 1880 #define KMP_STATS_LOOP_END /* Nothing */ 1881 #endif 1882 1883 template <typename T> 1884 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1885 T *p_lb, T *p_ub, 1886 typename traits_t<T>::signed_t *p_st 1887 #if OMPT_SUPPORT && OMPT_OPTIONAL 1888 , 1889 void *codeptr 1890 #endif 1891 ) { 1892 1893 typedef typename traits_t<T>::unsigned_t UT; 1894 typedef typename traits_t<T>::signed_t ST; 1895 // This is potentially slightly misleading, schedule(runtime) will appear here 1896 // even if the actual runtime schedule is static. (Which points out a 1897 // disadvantage of schedule(runtime): even when static scheduling is used it 1898 // costs more than a compile time choice to use static scheduling would.) 
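// For instance (illustrative user code, not part of the runtime): with
//   #pragma omp for schedule(runtime)
// and OMP_SCHEDULE=static, the compiler typically still emits
// __kmpc_dispatch_init_* / __kmpc_dispatch_next_* calls, so the loop is timed
// here under dynamic scheduling, whereas a literal
//   #pragma omp for schedule(static)
// is normally lowered to __kmpc_for_static_init_* and never reaches this path.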
1899 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1900 1901 int status; 1902 dispatch_private_info_template<T> *pr; 1903 kmp_info_t *th = __kmp_threads[gtid]; 1904 kmp_team_t *team = th->th.th_team; 1905 1906 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1907 KD_TRACE( 1908 1000, 1909 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1910 gtid, p_lb, p_ub, p_st, p_last)); 1911 1912 if (team->t.t_serialized) { 1913 /* NOTE: serialize this dispatch because we are not at the active level */ 1914 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1915 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1916 KMP_DEBUG_ASSERT(pr); 1917 1918 if ((status = (pr->u.p.tc != 0)) == 0) { 1919 *p_lb = 0; 1920 *p_ub = 0; 1921 // if ( p_last != NULL ) 1922 // *p_last = 0; 1923 if (p_st != NULL) 1924 *p_st = 0; 1925 if (__kmp_env_consistency_check) { 1926 if (pr->pushed_ws != ct_none) { 1927 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1928 } 1929 } 1930 } else if (pr->flags.nomerge) { 1931 kmp_int32 last; 1932 T start; 1933 UT limit, trip, init; 1934 ST incr; 1935 T chunk = pr->u.p.parm1; 1936 1937 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1938 gtid)); 1939 1940 init = chunk * pr->u.p.count++; 1941 trip = pr->u.p.tc - 1; 1942 1943 if ((status = (init <= trip)) == 0) { 1944 *p_lb = 0; 1945 *p_ub = 0; 1946 // if ( p_last != NULL ) 1947 // *p_last = 0; 1948 if (p_st != NULL) 1949 *p_st = 0; 1950 if (__kmp_env_consistency_check) { 1951 if (pr->pushed_ws != ct_none) { 1952 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1953 } 1954 } 1955 } else { 1956 start = pr->u.p.lb; 1957 limit = chunk + init - 1; 1958 incr = pr->u.p.st; 1959 1960 if ((last = (limit >= trip)) != 0) { 1961 limit = trip; 1962 #if KMP_OS_WINDOWS 1963 pr->u.p.last_upper = pr->u.p.ub; 1964 #endif /* KMP_OS_WINDOWS */ 1965 } 1966 if (p_last != NULL) 1967 *p_last = last; 1968 if (p_st != NULL) 1969 *p_st = incr; 1970 if (incr == 1) { 1971 *p_lb = start + init; 1972 *p_ub = start + limit; 1973 } else { 1974 *p_lb = start + init * incr; 1975 *p_ub = start + limit * incr; 1976 } 1977 1978 if (pr->flags.ordered) { 1979 pr->u.p.ordered_lower = init; 1980 pr->u.p.ordered_upper = limit; 1981 #ifdef KMP_DEBUG 1982 { 1983 char *buff; 1984 // create format specifiers before the debug output 1985 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1986 "ordered_lower:%%%s ordered_upper:%%%s\n", 1987 traits_t<UT>::spec, traits_t<UT>::spec); 1988 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1989 pr->u.p.ordered_upper)); 1990 __kmp_str_free(&buff); 1991 } 1992 #endif 1993 } // if 1994 } // if 1995 } else { 1996 pr->u.p.tc = 0; 1997 *p_lb = pr->u.p.lb; 1998 *p_ub = pr->u.p.ub; 1999 #if KMP_OS_WINDOWS 2000 pr->u.p.last_upper = *p_ub; 2001 #endif /* KMP_OS_WINDOWS */ 2002 if (p_last != NULL) 2003 *p_last = TRUE; 2004 if (p_st != NULL) 2005 *p_st = pr->u.p.st; 2006 } // if 2007 #ifdef KMP_DEBUG 2008 { 2009 char *buff; 2010 // create format specifiers before the debug output 2011 buff = __kmp_str_format( 2012 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2013 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2014 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2015 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 2016 __kmp_str_free(&buff); 2017 } 2018 #endif 2019 #if INCLUDE_SSC_MARKS 2020 SSC_MARK_DISPATCH_NEXT(); 2021 #endif 2022 OMPT_LOOP_END; 2023 
KMP_STATS_LOOP_END; 2024 return status; 2025 } else { 2026 kmp_int32 last = 0; 2027 dispatch_shared_info_template<T> volatile *sh; 2028 2029 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2030 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2031 2032 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2033 th->th.th_dispatch->th_dispatch_pr_current); 2034 KMP_DEBUG_ASSERT(pr); 2035 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2036 th->th.th_dispatch->th_dispatch_sh_current); 2037 KMP_DEBUG_ASSERT(sh); 2038 2039 #if KMP_USE_HIER_SCHED 2040 if (pr->flags.use_hier) 2041 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2042 else 2043 #endif // KMP_USE_HIER_SCHED 2044 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2045 p_st, th->th.th_team_nproc, 2046 th->th.th_info.ds.ds_tid); 2047 // status == 0: no more iterations to execute 2048 if (status == 0) { 2049 UT num_done; 2050 2051 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2052 #ifdef KMP_DEBUG 2053 { 2054 char *buff; 2055 // create format specifiers before the debug output 2056 buff = __kmp_str_format( 2057 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2058 traits_t<UT>::spec); 2059 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2060 __kmp_str_free(&buff); 2061 } 2062 #endif 2063 2064 #if KMP_USE_HIER_SCHED 2065 pr->flags.use_hier = FALSE; 2066 #endif 2067 if ((ST)num_done == th->th.th_team_nproc - 1) { 2068 #if (KMP_STATIC_STEAL_ENABLED) 2069 if (pr->schedule == kmp_sch_static_steal && 2070 traits_t<T>::type_size > 4) { 2071 int i; 2072 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2073 __kmp_dispatch_num_buffers; // current loop index 2074 kmp_info_t **other_threads = team->t.t_threads; 2075 // loop complete, safe to destroy locks used for stealing 2076 for (i = 0; i < th->th.th_team_nproc; ++i) { 2077 dispatch_private_info_template<T> *buf = 2078 reinterpret_cast<dispatch_private_info_template<T> *>( 2079 &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); 2080 kmp_lock_t *lck = buf->u.p.th_steal_lock; 2081 KMP_ASSERT(lck != NULL); 2082 __kmp_destroy_lock(lck); 2083 __kmp_free(lck); 2084 buf->u.p.th_steal_lock = NULL; 2085 } 2086 } 2087 #endif 2088 /* NOTE: release this buffer to be reused */ 2089 2090 KMP_MB(); /* Flush all pending memory write invalidates. */ 2091 2092 sh->u.s.num_done = 0; 2093 sh->u.s.iteration = 0; 2094 2095 /* TODO replace with general release procedure? */ 2096 if (pr->flags.ordered) { 2097 sh->u.s.ordered_iteration = 0; 2098 } 2099 2100 KMP_MB(); /* Flush all pending memory write invalidates. */ 2101 2102 sh->buffer_index += __kmp_dispatch_num_buffers; 2103 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2104 gtid, sh->buffer_index)); 2105 2106 KMP_MB(); /* Flush all pending memory write invalidates. 
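In particular, publish the buffer_index update above: threads that will reuse
this dispatch buffer for a later loop wait on buffer_index during
initialization, so that update, together with the resets of num_done and
iteration, has to be globally visible before they proceed.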
*/ 2107 2108 } // if 2109 if (__kmp_env_consistency_check) { 2110 if (pr->pushed_ws != ct_none) { 2111 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2112 } 2113 } 2114 2115 th->th.th_dispatch->th_deo_fcn = NULL; 2116 th->th.th_dispatch->th_dxo_fcn = NULL; 2117 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2118 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2119 } // if (status == 0) 2120 #if KMP_OS_WINDOWS 2121 else if (last) { 2122 pr->u.p.last_upper = pr->u.p.ub; 2123 } 2124 #endif /* KMP_OS_WINDOWS */ 2125 if (p_last != NULL && status != 0) 2126 *p_last = last; 2127 } // if 2128 2129 #ifdef KMP_DEBUG 2130 { 2131 char *buff; 2132 // create format specifiers before the debug output 2133 buff = __kmp_str_format( 2134 "__kmp_dispatch_next: T#%%d normal case: " 2135 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2136 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2137 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2138 (p_last ? *p_last : 0), status)); 2139 __kmp_str_free(&buff); 2140 } 2141 #endif 2142 #if INCLUDE_SSC_MARKS 2143 SSC_MARK_DISPATCH_NEXT(); 2144 #endif 2145 OMPT_LOOP_END; 2146 KMP_STATS_LOOP_END; 2147 return status; 2148 } 2149 2150 template <typename T> 2151 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2152 kmp_int32 *plastiter, T *plower, T *pupper, 2153 typename traits_t<T>::signed_t incr) { 2154 typedef typename traits_t<T>::unsigned_t UT; 2155 kmp_uint32 team_id; 2156 kmp_uint32 nteams; 2157 UT trip_count; 2158 kmp_team_t *team; 2159 kmp_info_t *th; 2160 2161 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2162 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2163 #ifdef KMP_DEBUG 2164 typedef typename traits_t<T>::signed_t ST; 2165 { 2166 char *buff; 2167 // create format specifiers before the debug output 2168 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2169 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2170 traits_t<T>::spec, traits_t<T>::spec, 2171 traits_t<ST>::spec, traits_t<T>::spec); 2172 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2173 __kmp_str_free(&buff); 2174 } 2175 #endif 2176 2177 if (__kmp_env_consistency_check) { 2178 if (incr == 0) { 2179 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2180 loc); 2181 } 2182 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2183 // The loop is illegal. 
2184 // Some zero-trip loops maintained by compiler, e.g.: 2185 // for(i=10;i<0;++i) // lower >= upper - run-time check 2186 // for(i=0;i>10;--i) // lower <= upper - run-time check 2187 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2188 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2189 // Compiler does not check the following illegal loops: 2190 // for(i=0;i<10;i+=incr) // where incr<0 2191 // for(i=10;i>0;i-=incr) // where incr<0 2192 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2193 } 2194 } 2195 th = __kmp_threads[gtid]; 2196 team = th->th.th_team; 2197 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2198 nteams = th->th.th_teams_size.nteams; 2199 team_id = team->t.t_master_tid; 2200 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2201 2202 // compute global trip count 2203 if (incr == 1) { 2204 trip_count = *pupper - *plower + 1; 2205 } else if (incr == -1) { 2206 trip_count = *plower - *pupper + 1; 2207 } else if (incr > 0) { 2208 // upper-lower can exceed the limit of signed type 2209 trip_count = (UT)(*pupper - *plower) / incr + 1; 2210 } else { 2211 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2212 } 2213 2214 if (trip_count <= nteams) { 2215 KMP_DEBUG_ASSERT( 2216 __kmp_static == kmp_sch_static_greedy || 2217 __kmp_static == 2218 kmp_sch_static_balanced); // Unknown static scheduling type. 2219 // only some teams get single iteration, others get nothing 2220 if (team_id < trip_count) { 2221 *pupper = *plower = *plower + team_id * incr; 2222 } else { 2223 *plower = *pupper + incr; // zero-trip loop 2224 } 2225 if (plastiter != NULL) 2226 *plastiter = (team_id == trip_count - 1); 2227 } else { 2228 if (__kmp_static == kmp_sch_static_balanced) { 2229 UT chunk = trip_count / nteams; 2230 UT extras = trip_count % nteams; 2231 *plower += 2232 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2233 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2234 if (plastiter != NULL) 2235 *plastiter = (team_id == nteams - 1); 2236 } else { 2237 T chunk_inc_count = 2238 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2239 T upper = *pupper; 2240 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2241 // Unknown static scheduling type. 2242 *plower += team_id * chunk_inc_count; 2243 *pupper = *plower + chunk_inc_count - incr; 2244 // Check/correct bounds if needed 2245 if (incr > 0) { 2246 if (*pupper < *plower) 2247 *pupper = traits_t<T>::max_value; 2248 if (plastiter != NULL) 2249 *plastiter = *plower <= upper && *pupper > upper - incr; 2250 if (*pupper > upper) 2251 *pupper = upper; // tracker C73258 2252 } else { 2253 if (*pupper > *plower) 2254 *pupper = traits_t<T>::min_value; 2255 if (plastiter != NULL) 2256 *plastiter = *plower >= upper && *pupper < upper - incr; 2257 if (*pupper < upper) 2258 *pupper = upper; // tracker C73258 2259 } 2260 } 2261 } 2262 } 2263 2264 //----------------------------------------------------------------------------- 2265 // Dispatch routines 2266 // Transfer call to template< type T > 2267 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2268 // T lb, T ub, ST st, ST chunk ) 2269 extern "C" { 2270 2271 /*! 
2272 @ingroup WORK_SHARING
2273 @{
2274 @param loc Source location
2275 @param gtid Global thread id
2276 @param schedule Schedule type
2277 @param lb Lower bound
2278 @param ub Upper bound
2279 @param st Step (or increment if you prefer)
2280 @param chunk The chunk size to block with
2281
2282 This function prepares the runtime to start a dynamically scheduled for loop,
2283 saving the loop arguments.
2284 These functions are all identical apart from the types of the arguments.
2285 */
2286
2287 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2288 enum sched_type schedule, kmp_int32 lb,
2289 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2290 KMP_DEBUG_ASSERT(__kmp_init_serial);
2291 #if OMPT_SUPPORT && OMPT_OPTIONAL
2292 OMPT_STORE_RETURN_ADDRESS(gtid);
2293 #endif
2294 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2295 }
2296 /*!
2297 See @ref __kmpc_dispatch_init_4
2298 */
2299 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2300 enum sched_type schedule, kmp_uint32 lb,
2301 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2302 KMP_DEBUG_ASSERT(__kmp_init_serial);
2303 #if OMPT_SUPPORT && OMPT_OPTIONAL
2304 OMPT_STORE_RETURN_ADDRESS(gtid);
2305 #endif
2306 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2307 }
2308
2309 /*!
2310 See @ref __kmpc_dispatch_init_4
2311 */
2312 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2313 enum sched_type schedule, kmp_int64 lb,
2314 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2315 KMP_DEBUG_ASSERT(__kmp_init_serial);
2316 #if OMPT_SUPPORT && OMPT_OPTIONAL
2317 OMPT_STORE_RETURN_ADDRESS(gtid);
2318 #endif
2319 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2320 }
2321
2322 /*!
2323 See @ref __kmpc_dispatch_init_4
2324 */
2325 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2326 enum sched_type schedule, kmp_uint64 lb,
2327 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2328 KMP_DEBUG_ASSERT(__kmp_init_serial);
2329 #if OMPT_SUPPORT && OMPT_OPTIONAL
2330 OMPT_STORE_RETURN_ADDRESS(gtid);
2331 #endif
2332 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2333 }
2334
2335 /*!
2336 See @ref __kmpc_dispatch_init_4
2337
2338 These functions differ from the __kmpc_dispatch_init set in that they are
2339 called for the composite distribute parallel for construct, so the per-team
2340 iteration space has to be computed before the regular iterations are dispatched.
2341
2342 These functions are all identical apart from the types of the arguments.
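As an illustration (an editorial sketch of the compiler-generated calling
sequence, not code contained in the runtime; the variable names, N, and the
dynamic,4 schedule are only examples, with loc/gtid as elsewhere in this
file), a composite

  #pragma omp distribute parallel for schedule(dynamic, 4)

over 32-bit signed indices could be lowered roughly as:
@code
kmp_int32 lb = 0, ub = N - 1, st = 1, last = 0;
kmp_int32 chunk_lb, chunk_ub, chunk_st;
// Narrow [lb, ub] to this team's share, then set up dynamic dispatch on it.
__kmpc_dist_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, &last, lb, ub,
                            st, 4);
while (__kmpc_dispatch_next_4(loc, gtid, &last, &chunk_lb, &chunk_ub,
                              &chunk_st)) {
  for (kmp_int32 i = chunk_lb; i <= chunk_ub; i += chunk_st)
    body(i); // loop body
}
@endcode
__kmp_dist_get_bounds splits the full iteration space across the teams first
(for example, 10 iterations over 4 teams under kmp_sch_static_balanced become
3/3/2/2), and only this team's portion is handed on to __kmp_dispatch_init.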
2343 */ 2344 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2345 enum sched_type schedule, kmp_int32 *p_last, 2346 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2347 kmp_int32 chunk) { 2348 KMP_DEBUG_ASSERT(__kmp_init_serial); 2349 #if OMPT_SUPPORT && OMPT_OPTIONAL 2350 OMPT_STORE_RETURN_ADDRESS(gtid); 2351 #endif 2352 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2353 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2354 } 2355 2356 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2357 enum sched_type schedule, kmp_int32 *p_last, 2358 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2359 kmp_int32 chunk) { 2360 KMP_DEBUG_ASSERT(__kmp_init_serial); 2361 #if OMPT_SUPPORT && OMPT_OPTIONAL 2362 OMPT_STORE_RETURN_ADDRESS(gtid); 2363 #endif 2364 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2365 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2366 } 2367 2368 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2369 enum sched_type schedule, kmp_int32 *p_last, 2370 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2371 kmp_int64 chunk) { 2372 KMP_DEBUG_ASSERT(__kmp_init_serial); 2373 #if OMPT_SUPPORT && OMPT_OPTIONAL 2374 OMPT_STORE_RETURN_ADDRESS(gtid); 2375 #endif 2376 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2377 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2378 } 2379 2380 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2381 enum sched_type schedule, kmp_int32 *p_last, 2382 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2383 kmp_int64 chunk) { 2384 KMP_DEBUG_ASSERT(__kmp_init_serial); 2385 #if OMPT_SUPPORT && OMPT_OPTIONAL 2386 OMPT_STORE_RETURN_ADDRESS(gtid); 2387 #endif 2388 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2389 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2390 } 2391 2392 /*! 2393 @param loc Source code location 2394 @param gtid Global thread id 2395 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2396 otherwise 2397 @param p_lb Pointer to the lower bound for the next chunk of work 2398 @param p_ub Pointer to the upper bound for the next chunk of work 2399 @param p_st Pointer to the stride for the next chunk of work 2400 @return one if there is work to be done, zero otherwise 2401 2402 Get the next dynamically allocated chunk of work for this thread. 2403 If there is no more work, then the lb,ub and stride need not be modified. 2404 */ 2405 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2406 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2407 #if OMPT_SUPPORT && OMPT_OPTIONAL 2408 OMPT_STORE_RETURN_ADDRESS(gtid); 2409 #endif 2410 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2411 #if OMPT_SUPPORT && OMPT_OPTIONAL 2412 , 2413 OMPT_LOAD_RETURN_ADDRESS(gtid) 2414 #endif 2415 ); 2416 } 2417 2418 /*! 2419 See @ref __kmpc_dispatch_next_4 2420 */ 2421 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2422 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2423 kmp_int32 *p_st) { 2424 #if OMPT_SUPPORT && OMPT_OPTIONAL 2425 OMPT_STORE_RETURN_ADDRESS(gtid); 2426 #endif 2427 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2428 #if OMPT_SUPPORT && OMPT_OPTIONAL 2429 , 2430 OMPT_LOAD_RETURN_ADDRESS(gtid) 2431 #endif 2432 ); 2433 } 2434 2435 /*! 
2436 See @ref __kmpc_dispatch_next_4 2437 */ 2438 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2439 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2440 #if OMPT_SUPPORT && OMPT_OPTIONAL 2441 OMPT_STORE_RETURN_ADDRESS(gtid); 2442 #endif 2443 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2444 #if OMPT_SUPPORT && OMPT_OPTIONAL 2445 , 2446 OMPT_LOAD_RETURN_ADDRESS(gtid) 2447 #endif 2448 ); 2449 } 2450 2451 /*! 2452 See @ref __kmpc_dispatch_next_4 2453 */ 2454 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2455 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2456 kmp_int64 *p_st) { 2457 #if OMPT_SUPPORT && OMPT_OPTIONAL 2458 OMPT_STORE_RETURN_ADDRESS(gtid); 2459 #endif 2460 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2461 #if OMPT_SUPPORT && OMPT_OPTIONAL 2462 , 2463 OMPT_LOAD_RETURN_ADDRESS(gtid) 2464 #endif 2465 ); 2466 } 2467 2468 /*! 2469 @param loc Source code location 2470 @param gtid Global thread id 2471 2472 Mark the end of a dynamic loop. 2473 */ 2474 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2475 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2476 } 2477 2478 /*! 2479 See @ref __kmpc_dispatch_fini_4 2480 */ 2481 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2482 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2483 } 2484 2485 /*! 2486 See @ref __kmpc_dispatch_fini_4 2487 */ 2488 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2489 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2490 } 2491 2492 /*! 2493 See @ref __kmpc_dispatch_fini_4 2494 */ 2495 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2496 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2497 } 2498 /*! @} */ 2499 2500 //----------------------------------------------------------------------------- 2501 // Non-template routines from kmp_dispatch.cpp used in other sources 2502 2503 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2504 return value == checker; 2505 } 2506 2507 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2508 return value != checker; 2509 } 2510 2511 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2512 return value < checker; 2513 } 2514 2515 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2516 return value >= checker; 2517 } 2518 2519 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2520 return value <= checker; 2521 } 2522 2523 kmp_uint32 2524 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2525 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2526 void *obj // Higher-level synchronization object, or NULL. 2527 ) { 2528 // note: we may not belong to a team at this point 2529 volatile kmp_uint32 *spin = spinner; 2530 kmp_uint32 check = checker; 2531 kmp_uint32 spins; 2532 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2533 kmp_uint32 r; 2534 2535 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2536 KMP_INIT_YIELD(spins); 2537 // main wait spin loop 2538 while (!f(r = TCR_4(*spin), check)) { 2539 KMP_FSYNC_SPIN_PREPARE(obj); 2540 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2541 split. 
It causes problems with infinite recursion because of exit lock */
2542 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2543 __kmp_abort_thread(); */
2544 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2545 }
2546 KMP_FSYNC_SPIN_ACQUIRED(obj);
2547 return r;
2548 }
2549
2550 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2551 kmp_uint32 (*pred)(void *, kmp_uint32),
2552 void *obj // Higher-level synchronization object, or NULL.
2553 ) {
2554 // note: we may not belong to a team at this point
2555 void *spin = spinner;
2556 kmp_uint32 check = checker;
2557 kmp_uint32 spins;
2558 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2559
2560 KMP_FSYNC_SPIN_INIT(obj, spin);
2561 KMP_INIT_YIELD(spins);
2562 // main wait spin loop
2563 while (!f(spin, check)) {
2564 KMP_FSYNC_SPIN_PREPARE(obj);
2565 /* if we have waited a bit, or are oversubscribed, yield */
2566 /* pause is in the following code */
2567 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2568 }
2569 KMP_FSYNC_SPIN_ACQUIRED(obj);
2570 }
2571
2572 } // extern "C"
2573
2574 #ifdef KMP_GOMP_COMPAT
2575
2576 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2577 enum sched_type schedule, kmp_int32 lb,
2578 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2579 int push_ws) {
2580 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2581 push_ws);
2582 }
2583
2584 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2585 enum sched_type schedule, kmp_uint32 lb,
2586 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2587 int push_ws) {
2588 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2589 push_ws);
2590 }
2591
2592 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2593 enum sched_type schedule, kmp_int64 lb,
2594 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2595 int push_ws) {
2596 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2597 push_ws);
2598 }
2599
2600 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2601 enum sched_type schedule, kmp_uint64 lb,
2602 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2603 int push_ws) {
2604 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2605 push_ws);
2606 }
2607
2608 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2609 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2610 }
2611
2612 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2613 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2614 }
2615
2616 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2617 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2618 }
2619
2620 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2621 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2622 }
2623
2624 #endif /* KMP_GOMP_COMPAT */
2625
2626 /* ------------------------------------------------------------------------ */
2627
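/* Illustrative sketch (hypothetical caller code, not part of the runtime):
   the __kmp_*_4 comparison helpers above are intended to be used as the
   predicate argument of __kmp_wait_4. For example, spinning until another
   thread publishes a flag could look like:

     volatile kmp_uint32 flag = 0;
     // releasing thread:
     TCW_4(flag, 1);
     // waiting thread:
     kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);

   __kmp_wait_4 spins until pred(*spinner, checker) becomes true, backing off
   through KMP_YIELD_OVERSUB_ELSE_SPIN while it waits, and returns the
   observed value that satisfied the predicate. */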