/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions. __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  // TODO: make nonmonotonic when static_steal is fixed
  int monotonicity = SCHEDULE_MONOTONIC;

  // Let default be monotonic for executables
  // compiled with OpenMP* 4.5 or less compilers
  if (loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;

  if (use_hier)
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;

  return monotonicity;
}

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule,chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1.
// This is often just the thread id within a team, but is not necessarily the
// case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
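      // For example, with OMP_SCHEDULE="guided,4" a schedule(runtime) loop is
      // expected to arrive here as kmp_sch_guided_chunked with chunk == 4,
      // which the code above maps to __kmp_guided. (Illustrative value; the
      // actual schedule kind and chunk come from team->t.t_sched.)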
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing which is
      // proportional to the number of chunks per thread up until
      // the maximum value of nproc.
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
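      // Worked example (illustrative numbers): tc=100 iterations with chunk=10
      // gives ntc=10 chunks; with nproc=4, small_chunk=2 and extras=2, so the
      // initial ownership is thread 0 -> chunks [0,3), thread 1 -> [3,6),
      // thread 2 -> [6,8), thread 3 -> [8,10). Chunks a thread does not finish
      // are what the other threads may later steal.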
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
        pr->u.p.th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.th_steal_lock);
      }
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      if (pr->u.p.parm1 <= 0)
        pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
      break;
    } // if
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / (double)nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = 1.0 - 0.5 / (double)nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

        // C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
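    // The assignment below gives each thread one contiguous block of
    // ceil(tc / nproc) iterations; e.g. (illustrative) tc=100 and nproc=8
    // yield parm1 = 13, so threads 0..6 take 13 iterations each and thread 7
    // takes the remaining 9.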
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle. Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //   parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}

#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing
  // when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable.
  // Hierarchical scheduling does not work with ordered, so if ordered is
  // detected, then revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical for ordered parallel loops and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }

  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, there would still be a bad case like using 0 and
  // 1 rather than a program life-time increment. So a dedicated variable is
  // required; the 'static_steal_counter' is used.
  if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // since then.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
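      // In effect, each thread waits until the shared ordered_iteration
      // reaches its own ordered_lower and then bumps it by one (below), which
      // is what serializes the ordered sections in iteration order.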
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
      //!!!!! TODO check if the inc should be unsigned, or signed???
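      // Illustrative example (hypothetical values): for a chunk covering
      // ordered iterations 4..7, inc starts as 4; if one ordered section was
      // already executed (ordered_bumped == 1), the remaining inc of 3 is
      // added to sh->u.s.ordered_iteration below so the next thread in the
      // ordered sequence can proceed.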
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  bool last = false;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte and CAS for 4-byte induction
      // variable. TODO (optional): check and use 16-byte CAS
      kmp_lock_t *lck = pr->u.p.th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        T while_limit = pr->u.p.parm3;
        T while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because no victim passed kmp_init_dispatch yet
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = victim->u.p.th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
          if (remaining > 3) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 2 or 3 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with stolen range but init chunk
          __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
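      // Packing (count, ub) into a single 8-byte word appears to be what makes
      // this lock-free path safe: both the owner taking its next chunk and a
      // thief lowering ub go through one 64-bit CAS, so any concurrent change
      // to either field makes the CAS fail and the update is retried.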
      // All operations on 'count' or 'ub' must be combined atomically
      // together.
      {
        union_i4 vold, vnew;
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
        while (!KMP_COMPARE_AND_STORE_ACQ64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew = vold;
          vnew.p.count++;
        }
        vnew = vold;
        init = vnew.p.count;
        status = (init < (UT)vnew.p.ub);
      }

      if (!status) {
        kmp_info_t **other_threads = team->t.t_threads;
        T while_limit = pr->u.p.parm3;
        T while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          union_i4 vold, vnew;
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because no victim passed kmp_init_dispatch yet
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, goto next victim
            }
            if (remaining > 3) {
              // try to steal 1/4 of remaining
              vnew.p.ub -= remaining >> 2;
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            // TODO: Should this be acquire or release?
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, try again
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = (pr->u.p.parm1 != 0);
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = true; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init + (UT)((double)remaining *
                          *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given
    // chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = true; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span;
      __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
                         &span);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on
       IA-32 architecture 8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
        /* use exponential-style scheduling */
        /* The following check is to workaround the lack of long double
           precision on Windows* OS.
           This check works around the possible effect that init != 0 for
           chunkIdx == 0.
        */
#if KMP_USE_X87CONTROL
        /* If we haven't already done so, save original
           FPCW and set precision to 64-bit, as Windows* OS
           on IA-32 architecture defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary
       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
    */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;

  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1839 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1840 __kmp_str_free(&buff); 1841 } 1842 #endif 1843 return status; 1844 } 1845 1846 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1847 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1848 is not called. */ 1849 #if OMPT_SUPPORT && OMPT_OPTIONAL 1850 #define OMPT_LOOP_END \ 1851 if (status == 0) { \ 1852 if (ompt_enabled.ompt_callback_work) { \ 1853 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1854 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1855 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1856 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1857 &(task_info->task_data), 0, codeptr); \ 1858 } \ 1859 } 1860 // TODO: implement count 1861 #else 1862 #define OMPT_LOOP_END // no-op 1863 #endif 1864 1865 #if KMP_STATS_ENABLED 1866 #define KMP_STATS_LOOP_END \ 1867 { \ 1868 kmp_int64 u, l, t, i; \ 1869 l = (kmp_int64)(*p_lb); \ 1870 u = (kmp_int64)(*p_ub); \ 1871 i = (kmp_int64)(pr->u.p.st); \ 1872 if (status == 0) { \ 1873 t = 0; \ 1874 KMP_POP_PARTITIONED_TIMER(); \ 1875 } else if (i == 1) { \ 1876 if (u >= l) \ 1877 t = u - l + 1; \ 1878 else \ 1879 t = 0; \ 1880 } else if (i < 0) { \ 1881 if (l >= u) \ 1882 t = (l - u) / (-i) + 1; \ 1883 else \ 1884 t = 0; \ 1885 } else { \ 1886 if (u >= l) \ 1887 t = (u - l) / i + 1; \ 1888 else \ 1889 t = 0; \ 1890 } \ 1891 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1892 } 1893 #else 1894 #define KMP_STATS_LOOP_END /* Nothing */ 1895 #endif 1896 1897 template <typename T> 1898 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1899 T *p_lb, T *p_ub, 1900 typename traits_t<T>::signed_t *p_st 1901 #if OMPT_SUPPORT && OMPT_OPTIONAL 1902 , 1903 void *codeptr 1904 #endif 1905 ) { 1906 1907 typedef typename traits_t<T>::unsigned_t UT; 1908 typedef typename traits_t<T>::signed_t ST; 1909 // This is potentially slightly misleading, schedule(runtime) will appear here 1910 // even if the actual runtime schedule is static. (Which points out a 1911 // disadvantage of schedule(runtime): even when static scheduling is used it 1912 // costs more than a compile time choice to use static scheduling would.) 
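  // Illustrative example (values assumed here for clarity, not taken from the
  // code above): for a loop with lb=0, ub=99, st=1 and chunk=10 under
  // kmp_sch_dynamic_chunked, a successful call fills an inclusive chunk such
  // as *p_lb == 20, *p_ub == 29, *p_st == 1 and returns 1; once the iteration
  // space is exhausted it returns 0 and the caller leaves its dispatch loop.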
1913 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1914 1915 int status; 1916 dispatch_private_info_template<T> *pr; 1917 __kmp_assert_valid_gtid(gtid); 1918 kmp_info_t *th = __kmp_threads[gtid]; 1919 kmp_team_t *team = th->th.th_team; 1920 1921 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1922 KD_TRACE( 1923 1000, 1924 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1925 gtid, p_lb, p_ub, p_st, p_last)); 1926 1927 if (team->t.t_serialized) { 1928 /* NOTE: serialize this dispatch because we are not at the active level */ 1929 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1930 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1931 KMP_DEBUG_ASSERT(pr); 1932 1933 if ((status = (pr->u.p.tc != 0)) == 0) { 1934 *p_lb = 0; 1935 *p_ub = 0; 1936 // if ( p_last != NULL ) 1937 // *p_last = 0; 1938 if (p_st != NULL) 1939 *p_st = 0; 1940 if (__kmp_env_consistency_check) { 1941 if (pr->pushed_ws != ct_none) { 1942 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1943 } 1944 } 1945 } else if (pr->flags.nomerge) { 1946 kmp_int32 last; 1947 T start; 1948 UT limit, trip, init; 1949 ST incr; 1950 T chunk = pr->u.p.parm1; 1951 1952 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1953 gtid)); 1954 1955 init = chunk * pr->u.p.count++; 1956 trip = pr->u.p.tc - 1; 1957 1958 if ((status = (init <= trip)) == 0) { 1959 *p_lb = 0; 1960 *p_ub = 0; 1961 // if ( p_last != NULL ) 1962 // *p_last = 0; 1963 if (p_st != NULL) 1964 *p_st = 0; 1965 if (__kmp_env_consistency_check) { 1966 if (pr->pushed_ws != ct_none) { 1967 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1968 } 1969 } 1970 } else { 1971 start = pr->u.p.lb; 1972 limit = chunk + init - 1; 1973 incr = pr->u.p.st; 1974 1975 if ((last = (limit >= trip)) != 0) { 1976 limit = trip; 1977 #if KMP_OS_WINDOWS 1978 pr->u.p.last_upper = pr->u.p.ub; 1979 #endif /* KMP_OS_WINDOWS */ 1980 } 1981 if (p_last != NULL) 1982 *p_last = last; 1983 if (p_st != NULL) 1984 *p_st = incr; 1985 if (incr == 1) { 1986 *p_lb = start + init; 1987 *p_ub = start + limit; 1988 } else { 1989 *p_lb = start + init * incr; 1990 *p_ub = start + limit * incr; 1991 } 1992 1993 if (pr->flags.ordered) { 1994 pr->u.p.ordered_lower = init; 1995 pr->u.p.ordered_upper = limit; 1996 #ifdef KMP_DEBUG 1997 { 1998 char *buff; 1999 // create format specifiers before the debug output 2000 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2001 "ordered_lower:%%%s ordered_upper:%%%s\n", 2002 traits_t<UT>::spec, traits_t<UT>::spec); 2003 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2004 pr->u.p.ordered_upper)); 2005 __kmp_str_free(&buff); 2006 } 2007 #endif 2008 } // if 2009 } // if 2010 } else { 2011 pr->u.p.tc = 0; 2012 *p_lb = pr->u.p.lb; 2013 *p_ub = pr->u.p.ub; 2014 #if KMP_OS_WINDOWS 2015 pr->u.p.last_upper = *p_ub; 2016 #endif /* KMP_OS_WINDOWS */ 2017 if (p_last != NULL) 2018 *p_last = TRUE; 2019 if (p_st != NULL) 2020 *p_st = pr->u.p.st; 2021 } // if 2022 #ifdef KMP_DEBUG 2023 { 2024 char *buff; 2025 // create format specifiers before the debug output 2026 buff = __kmp_str_format( 2027 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2028 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2029 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2030 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, 2031 (p_last ? 
*p_last : 0), status)); 2032 __kmp_str_free(&buff); 2033 } 2034 #endif 2035 #if INCLUDE_SSC_MARKS 2036 SSC_MARK_DISPATCH_NEXT(); 2037 #endif 2038 OMPT_LOOP_END; 2039 KMP_STATS_LOOP_END; 2040 return status; 2041 } else { 2042 kmp_int32 last = 0; 2043 dispatch_shared_info_template<T> volatile *sh; 2044 2045 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2046 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2047 2048 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2049 th->th.th_dispatch->th_dispatch_pr_current); 2050 KMP_DEBUG_ASSERT(pr); 2051 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2052 th->th.th_dispatch->th_dispatch_sh_current); 2053 KMP_DEBUG_ASSERT(sh); 2054 2055 #if KMP_USE_HIER_SCHED 2056 if (pr->flags.use_hier) 2057 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2058 else 2059 #endif // KMP_USE_HIER_SCHED 2060 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2061 p_st, th->th.th_team_nproc, 2062 th->th.th_info.ds.ds_tid); 2063 // status == 0: no more iterations to execute 2064 if (status == 0) { 2065 UT num_done; 2066 2067 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2068 #ifdef KMP_DEBUG 2069 { 2070 char *buff; 2071 // create format specifiers before the debug output 2072 buff = __kmp_str_format( 2073 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2074 traits_t<UT>::spec); 2075 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2076 __kmp_str_free(&buff); 2077 } 2078 #endif 2079 2080 #if KMP_USE_HIER_SCHED 2081 pr->flags.use_hier = FALSE; 2082 #endif 2083 if ((ST)num_done == th->th.th_team_nproc - 1) { 2084 #if (KMP_STATIC_STEAL_ENABLED) 2085 if (pr->schedule == kmp_sch_static_steal && 2086 traits_t<T>::type_size > 4) { 2087 int i; 2088 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2089 __kmp_dispatch_num_buffers; // current loop index 2090 kmp_info_t **other_threads = team->t.t_threads; 2091 // loop complete, safe to destroy locks used for stealing 2092 for (i = 0; i < th->th.th_team_nproc; ++i) { 2093 dispatch_private_info_template<T> *buf = 2094 reinterpret_cast<dispatch_private_info_template<T> *>( 2095 &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); 2096 kmp_lock_t *lck = buf->u.p.th_steal_lock; 2097 KMP_ASSERT(lck != NULL); 2098 __kmp_destroy_lock(lck); 2099 __kmp_free(lck); 2100 buf->u.p.th_steal_lock = NULL; 2101 } 2102 } 2103 #endif 2104 /* NOTE: release this buffer to be reused */ 2105 2106 KMP_MB(); /* Flush all pending memory write invalidates. */ 2107 2108 sh->u.s.num_done = 0; 2109 sh->u.s.iteration = 0; 2110 2111 /* TODO replace with general release procedure? */ 2112 if (pr->flags.ordered) { 2113 sh->u.s.ordered_iteration = 0; 2114 } 2115 2116 KMP_MB(); /* Flush all pending memory write invalidates. */ 2117 2118 sh->buffer_index += __kmp_dispatch_num_buffers; 2119 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2120 gtid, sh->buffer_index)); 2121 2122 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2123 2124 } // if 2125 if (__kmp_env_consistency_check) { 2126 if (pr->pushed_ws != ct_none) { 2127 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2128 } 2129 } 2130 2131 th->th.th_dispatch->th_deo_fcn = NULL; 2132 th->th.th_dispatch->th_dxo_fcn = NULL; 2133 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2134 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2135 } // if (status == 0) 2136 #if KMP_OS_WINDOWS 2137 else if (last) { 2138 pr->u.p.last_upper = pr->u.p.ub; 2139 } 2140 #endif /* KMP_OS_WINDOWS */ 2141 if (p_last != NULL && status != 0) 2142 *p_last = last; 2143 } // if 2144 2145 #ifdef KMP_DEBUG 2146 { 2147 char *buff; 2148 // create format specifiers before the debug output 2149 buff = __kmp_str_format( 2150 "__kmp_dispatch_next: T#%%d normal case: " 2151 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2152 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2153 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2154 (p_last ? *p_last : 0), status)); 2155 __kmp_str_free(&buff); 2156 } 2157 #endif 2158 #if INCLUDE_SSC_MARKS 2159 SSC_MARK_DISPATCH_NEXT(); 2160 #endif 2161 OMPT_LOOP_END; 2162 KMP_STATS_LOOP_END; 2163 return status; 2164 } 2165 2166 template <typename T> 2167 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2168 kmp_int32 *plastiter, T *plower, T *pupper, 2169 typename traits_t<T>::signed_t incr) { 2170 typedef typename traits_t<T>::unsigned_t UT; 2171 kmp_uint32 team_id; 2172 kmp_uint32 nteams; 2173 UT trip_count; 2174 kmp_team_t *team; 2175 kmp_info_t *th; 2176 2177 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2178 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2179 #ifdef KMP_DEBUG 2180 typedef typename traits_t<T>::signed_t ST; 2181 { 2182 char *buff; 2183 // create format specifiers before the debug output 2184 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2185 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2186 traits_t<T>::spec, traits_t<T>::spec, 2187 traits_t<ST>::spec, traits_t<T>::spec); 2188 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2189 __kmp_str_free(&buff); 2190 } 2191 #endif 2192 2193 if (__kmp_env_consistency_check) { 2194 if (incr == 0) { 2195 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2196 loc); 2197 } 2198 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2199 // The loop is illegal. 
2200 // Some zero-trip loops maintained by compiler, e.g.: 2201 // for(i=10;i<0;++i) // lower >= upper - run-time check 2202 // for(i=0;i>10;--i) // lower <= upper - run-time check 2203 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2204 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2205 // Compiler does not check the following illegal loops: 2206 // for(i=0;i<10;i+=incr) // where incr<0 2207 // for(i=10;i>0;i-=incr) // where incr<0 2208 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2209 } 2210 } 2211 __kmp_assert_valid_gtid(gtid); 2212 th = __kmp_threads[gtid]; 2213 team = th->th.th_team; 2214 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2215 nteams = th->th.th_teams_size.nteams; 2216 team_id = team->t.t_master_tid; 2217 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2218 2219 // compute global trip count 2220 if (incr == 1) { 2221 trip_count = *pupper - *plower + 1; 2222 } else if (incr == -1) { 2223 trip_count = *plower - *pupper + 1; 2224 } else if (incr > 0) { 2225 // upper-lower can exceed the limit of signed type 2226 trip_count = (UT)(*pupper - *plower) / incr + 1; 2227 } else { 2228 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2229 } 2230 2231 if (trip_count <= nteams) { 2232 KMP_DEBUG_ASSERT( 2233 __kmp_static == kmp_sch_static_greedy || 2234 __kmp_static == 2235 kmp_sch_static_balanced); // Unknown static scheduling type. 2236 // only some teams get single iteration, others get nothing 2237 if (team_id < trip_count) { 2238 *pupper = *plower = *plower + team_id * incr; 2239 } else { 2240 *plower = *pupper + incr; // zero-trip loop 2241 } 2242 if (plastiter != NULL) 2243 *plastiter = (team_id == trip_count - 1); 2244 } else { 2245 if (__kmp_static == kmp_sch_static_balanced) { 2246 UT chunk = trip_count / nteams; 2247 UT extras = trip_count % nteams; 2248 *plower += 2249 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2250 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2251 if (plastiter != NULL) 2252 *plastiter = (team_id == nteams - 1); 2253 } else { 2254 T chunk_inc_count = 2255 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2256 T upper = *pupper; 2257 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2258 // Unknown static scheduling type. 2259 *plower += team_id * chunk_inc_count; 2260 *pupper = *plower + chunk_inc_count - incr; 2261 // Check/correct bounds if needed 2262 if (incr > 0) { 2263 if (*pupper < *plower) 2264 *pupper = traits_t<T>::max_value; 2265 if (plastiter != NULL) 2266 *plastiter = *plower <= upper && *pupper > upper - incr; 2267 if (*pupper > upper) 2268 *pupper = upper; // tracker C73258 2269 } else { 2270 if (*pupper > *plower) 2271 *pupper = traits_t<T>::min_value; 2272 if (plastiter != NULL) 2273 *plastiter = *plower >= upper && *pupper < upper - incr; 2274 if (*pupper < upper) 2275 *pupper = upper; // tracker C73258 2276 } 2277 } 2278 } 2279 } 2280 2281 //----------------------------------------------------------------------------- 2282 // Dispatch routines 2283 // Transfer call to template< type T > 2284 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2285 // T lb, T ub, ST st, ST chunk ) 2286 extern "C" { 2287 2288 /*! 
2289 @ingroup WORK_SHARING 2290 @{ 2291 @param loc Source location 2292 @param gtid Global thread id 2293 @param schedule Schedule type 2294 @param lb Lower bound 2295 @param ub Upper bound 2296 @param st Step (or increment if you prefer) 2297 @param chunk The chunk size to block with 2298 2299 This function prepares the runtime to start a dynamically scheduled for loop, 2300 saving the loop arguments. 2301 These functions are all identical apart from the types of the arguments. 2302 */ 2303 2304 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2305 enum sched_type schedule, kmp_int32 lb, 2306 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2307 KMP_DEBUG_ASSERT(__kmp_init_serial); 2308 #if OMPT_SUPPORT && OMPT_OPTIONAL 2309 OMPT_STORE_RETURN_ADDRESS(gtid); 2310 #endif 2311 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2312 } 2313 /*! 2314 See @ref __kmpc_dispatch_init_4 2315 */ 2316 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2317 enum sched_type schedule, kmp_uint32 lb, 2318 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2319 KMP_DEBUG_ASSERT(__kmp_init_serial); 2320 #if OMPT_SUPPORT && OMPT_OPTIONAL 2321 OMPT_STORE_RETURN_ADDRESS(gtid); 2322 #endif 2323 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2324 } 2325 2326 /*! 2327 See @ref __kmpc_dispatch_init_4 2328 */ 2329 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2330 enum sched_type schedule, kmp_int64 lb, 2331 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2332 KMP_DEBUG_ASSERT(__kmp_init_serial); 2333 #if OMPT_SUPPORT && OMPT_OPTIONAL 2334 OMPT_STORE_RETURN_ADDRESS(gtid); 2335 #endif 2336 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2337 } 2338 2339 /*! 2340 See @ref __kmpc_dispatch_init_4 2341 */ 2342 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2343 enum sched_type schedule, kmp_uint64 lb, 2344 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2345 KMP_DEBUG_ASSERT(__kmp_init_serial); 2346 #if OMPT_SUPPORT && OMPT_OPTIONAL 2347 OMPT_STORE_RETURN_ADDRESS(gtid); 2348 #endif 2349 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2350 } 2351 2352 /*! 2353 See @ref __kmpc_dispatch_init_4 2354 2355 These functions differ from the __kmpc_dispatch_init set in that they are 2356 called for the composite distribute parallel for construct, so the 2357 per-team iteration space must be computed before the regular iterations are dispatched. 2358 2359 These functions are all identical apart from the types of the arguments.
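
An illustrative (assumed, not normative) example of the per-team split: with
incr == 1, a global iteration space of 0..9 (10 iterations) and 4 teams under
kmp_sch_static_balanced, __kmp_dist_get_bounds assigns teams 0 and 1 the
sub-ranges [0,2] and [3,5] (3 iterations each) and teams 2 and 3 the
sub-ranges [6,7] and [8,9] (2 iterations each); each team then dispatches its
own sub-range via __kmp_dispatch_init as usual.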
2360 */ 2361 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2362 enum sched_type schedule, kmp_int32 *p_last, 2363 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2364 kmp_int32 chunk) { 2365 KMP_DEBUG_ASSERT(__kmp_init_serial); 2366 #if OMPT_SUPPORT && OMPT_OPTIONAL 2367 OMPT_STORE_RETURN_ADDRESS(gtid); 2368 #endif 2369 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2370 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2371 } 2372 2373 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2374 enum sched_type schedule, kmp_int32 *p_last, 2375 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2376 kmp_int32 chunk) { 2377 KMP_DEBUG_ASSERT(__kmp_init_serial); 2378 #if OMPT_SUPPORT && OMPT_OPTIONAL 2379 OMPT_STORE_RETURN_ADDRESS(gtid); 2380 #endif 2381 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2382 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2383 } 2384 2385 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2386 enum sched_type schedule, kmp_int32 *p_last, 2387 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2388 kmp_int64 chunk) { 2389 KMP_DEBUG_ASSERT(__kmp_init_serial); 2390 #if OMPT_SUPPORT && OMPT_OPTIONAL 2391 OMPT_STORE_RETURN_ADDRESS(gtid); 2392 #endif 2393 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2394 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2395 } 2396 2397 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2398 enum sched_type schedule, kmp_int32 *p_last, 2399 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2400 kmp_int64 chunk) { 2401 KMP_DEBUG_ASSERT(__kmp_init_serial); 2402 #if OMPT_SUPPORT && OMPT_OPTIONAL 2403 OMPT_STORE_RETURN_ADDRESS(gtid); 2404 #endif 2405 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2406 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2407 } 2408 2409 /*! 2410 @param loc Source code location 2411 @param gtid Global thread id 2412 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2413 otherwise 2414 @param p_lb Pointer to the lower bound for the next chunk of work 2415 @param p_ub Pointer to the upper bound for the next chunk of work 2416 @param p_st Pointer to the stride for the next chunk of work 2417 @return one if there is work to be done, zero otherwise 2418 2419 Get the next dynamically allocated chunk of work for this thread. 2420 If there is no more work, then the lb,ub and stride need not be modified. 2421 */ 2422 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2423 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2424 #if OMPT_SUPPORT && OMPT_OPTIONAL 2425 OMPT_STORE_RETURN_ADDRESS(gtid); 2426 #endif 2427 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2428 #if OMPT_SUPPORT && OMPT_OPTIONAL 2429 , 2430 OMPT_LOAD_RETURN_ADDRESS(gtid) 2431 #endif 2432 ); 2433 } 2434 2435 /*! 2436 See @ref __kmpc_dispatch_next_4 2437 */ 2438 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2439 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2440 kmp_int32 *p_st) { 2441 #if OMPT_SUPPORT && OMPT_OPTIONAL 2442 OMPT_STORE_RETURN_ADDRESS(gtid); 2443 #endif 2444 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2445 #if OMPT_SUPPORT && OMPT_OPTIONAL 2446 , 2447 OMPT_LOAD_RETURN_ADDRESS(gtid) 2448 #endif 2449 ); 2450 } 2451 2452 /*! 
2453 See @ref __kmpc_dispatch_next_4 2454 */ 2455 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2456 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2457 #if OMPT_SUPPORT && OMPT_OPTIONAL 2458 OMPT_STORE_RETURN_ADDRESS(gtid); 2459 #endif 2460 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2461 #if OMPT_SUPPORT && OMPT_OPTIONAL 2462 , 2463 OMPT_LOAD_RETURN_ADDRESS(gtid) 2464 #endif 2465 ); 2466 } 2467 2468 /*! 2469 See @ref __kmpc_dispatch_next_4 2470 */ 2471 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2472 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2473 kmp_int64 *p_st) { 2474 #if OMPT_SUPPORT && OMPT_OPTIONAL 2475 OMPT_STORE_RETURN_ADDRESS(gtid); 2476 #endif 2477 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2478 #if OMPT_SUPPORT && OMPT_OPTIONAL 2479 , 2480 OMPT_LOAD_RETURN_ADDRESS(gtid) 2481 #endif 2482 ); 2483 } 2484 2485 /*! 2486 @param loc Source code location 2487 @param gtid Global thread id 2488 2489 Mark the end of a dynamic loop. 2490 */ 2491 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2492 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2493 } 2494 2495 /*! 2496 See @ref __kmpc_dispatch_fini_4 2497 */ 2498 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2499 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2500 } 2501 2502 /*! 2503 See @ref __kmpc_dispatch_fini_4 2504 */ 2505 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2506 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2507 } 2508 2509 /*! 2510 See @ref __kmpc_dispatch_fini_4 2511 */ 2512 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2513 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2514 } 2515 /*! @} */ 2516 2517 //----------------------------------------------------------------------------- 2518 // Non-template routines from kmp_dispatch.cpp used in other sources 2519 2520 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2521 return value == checker; 2522 } 2523 2524 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2525 return value != checker; 2526 } 2527 2528 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2529 return value < checker; 2530 } 2531 2532 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2533 return value >= checker; 2534 } 2535 2536 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2537 return value <= checker; 2538 } 2539 2540 kmp_uint32 2541 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2542 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2543 void *obj // Higher-level synchronization object, or NULL. 2544 ) { 2545 // note: we may not belong to a team at this point 2546 volatile kmp_uint32 *spin = spinner; 2547 kmp_uint32 check = checker; 2548 kmp_uint32 spins; 2549 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2550 kmp_uint32 r; 2551 2552 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2553 KMP_INIT_YIELD(spins); 2554 // main wait spin loop 2555 while (!f(r = TCR_4(*spin), check)) { 2556 KMP_FSYNC_SPIN_PREPARE(obj); 2557 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2558 split. 
It causes problems with infinite recursion because of exit lock */ 2559 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2560 __kmp_abort_thread(); */ 2561 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2562 } 2563 KMP_FSYNC_SPIN_ACQUIRED(obj); 2564 return r; 2565 } 2566 2567 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, 2568 kmp_uint32 (*pred)(void *, kmp_uint32), 2569 void *obj // Higher-level synchronization object, or NULL. 2570 ) { 2571 // note: we may not belong to a team at this point 2572 void *spin = spinner; 2573 kmp_uint32 check = checker; 2574 kmp_uint32 spins; 2575 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2576 2577 KMP_FSYNC_SPIN_INIT(obj, spin); 2578 KMP_INIT_YIELD(spins); 2579 // main wait spin loop 2580 while (!f(spin, check)) { 2581 KMP_FSYNC_SPIN_PREPARE(obj); 2582 /* if we have waited a bit, or are oversubscribed, yield */ 2583 /* pause is in the following code */ 2584 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2585 } 2586 KMP_FSYNC_SPIN_ACQUIRED(obj); 2587 } 2588 2589 } // extern "C" 2590 2591 #ifdef KMP_GOMP_COMPAT 2592 2593 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2594 enum sched_type schedule, kmp_int32 lb, 2595 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2596 int push_ws) { 2597 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2598 push_ws); 2599 } 2600 2601 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2602 enum sched_type schedule, kmp_uint32 lb, 2603 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2604 int push_ws) { 2605 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2606 push_ws); 2607 } 2608 2609 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2610 enum sched_type schedule, kmp_int64 lb, 2611 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2612 int push_ws) { 2613 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2614 push_ws); 2615 } 2616 2617 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2618 enum sched_type schedule, kmp_uint64 lb, 2619 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2620 int push_ws) { 2621 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2622 push_ws); 2623 } 2624 2625 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2626 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2627 } 2628 2629 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2630 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2631 } 2632 2633 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2634 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2635 } 2636 2637 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2638 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2639 } 2640 2641 #endif /* KMP_GOMP_COMPAT */ 2642 2643 /* ------------------------------------------------------------------------ */ 2644
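
/* Usage sketch (illustrative only, therefore kept inside a comment): roughly
   the call sequence a compiler emits for a worksharing loop such as
       #pragma omp for schedule(dynamic, 4)
   The names `loc`, `gtid` and `body` stand for values the compiler already
   has at the call site; `lowered_dynamic_loop` is a hypothetical name, not a
   symbol defined in this file.

   static void lowered_dynamic_loop(ident_t *loc, kmp_int32 gtid,
                                    kmp_int32 lb, kmp_int32 ub, kmp_int32 st) {
     kmp_int32 last = 0, p_lb, p_ub, p_st;
     // Register the loop with the runtime: dynamic schedule, chunk size 4.
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
     // Keep asking for chunks until the runtime reports the loop is finished.
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &p_lb, &p_ub, &p_st)) {
       for (kmp_int32 i = p_lb; i <= p_ub; i += p_st)
         body(i); // compiler-outlined loop body
     }
     // For ordered loops a matching __kmpc_dispatch_fini_4 call typically
     // follows each chunk; plain dynamic loops need no explicit fini call.
   }
*/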