/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// avoid inadvertently using a library-based abs
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};

//----------Loop canonicalization---------------------------------------------

// For loop nest (any shape):
//   convert != to < or >;
//   switch from using < or > to <= or >=.
// "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. It should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed).
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}

// Canonicalize loop nest. original_bounds_nest is an array of length n.
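// For illustration (values are hypothetical): a C loop
//   for (i = 0; i < N; i++)
// arrives here as {lb0 = 0, ub0 = N, step = 1, comp <} and leaves as
// {lb0 = 0, ub0 = N - 1, step = 1, comp <=}; trip counts are not touched at
// this point.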
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use span for this, it's too big.

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}

// Calculate trip count on this loop level.
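// For example (hypothetical values), a canonicalized level with
//   lb0 = 0, ub0 = 9, step = 3, comparison <=
// has trip_count = (9 - 0) / 3 + 1 = 4 (IV values 0, 3, 6, 9).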
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim original iv according to its type.
// Return a kmp_uint64 value which can be easily used in all internal
// calculations and can be statically cast back to the original type in user
// code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into upper bounds on this level,
// false otherwise
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into lower-upper bounds on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, should start
    // with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------
// Calculate one iv corresponding to iteration on the level ind
// (rectangular case, so no bounds check is needed).
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate overall trip count.
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated).
// Returns trip count of overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}

//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to overall new_iv.
// Note: original IV will be returned as if it had kmp_uint64 type,
// will have to be converted to original type in user code.
// Note: trip counts should be already calculated by
// __kmpc_process_loop_nest_rectang.
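// For example (hypothetical trip counts {4, 10} on a 2-level nest), the
// div/mod loop below maps new_iv = 23 to iterations {23 / 10, 23 % 10} =
// {2, 3}, which are then converted to original IVs level by level.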
// OMPTODO: special case 2, 3 nested loops: either do different
// interface without array or possibly template this over n
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);

  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
}

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate span for IV on this loop level for "<=" case.
// Note: it's for <= on this loop nest level, so the lower bound should be the
// smallest value and the upper bound should be the biggest value. If the loop
// won't execute, 'smallest' may be bigger than 'biggest', but we'd better not
// switch them around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}
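// A small example for the "<=" case above (values are illustrative only): an
// inner loop j <= 0 + 1*i nested under an outer loop whose span is [0, N]
// gets span_smallest = 0 and span_biggest = N from the two candidate pairs.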
// Calculate span for IV on this loop level for ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate maximum possible span for IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped
// shape. We need to stay in the original range of values, so that there will
// be no overflow; for that we'll adjust both upper and lower bounds as needed.
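// For example (assuming the outer loop is rectangular with span [0, N]): a
// lower-triangular inner loop j <= 0 + 1*i has __kmp_sign(lb1) == 0 and
// __kmp_sign(ub1) == 1, so both multipliers are zeroed below and ub0 grows by
// old_ub1 * span_biggest of the outer loop, i.e. the level is expanded to the
// rectangle j <= N.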
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // get upper and lower bounds to be parallel
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
    // The idea here is that for this IV we are now getting the same span
    // irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // still need to move lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // still need to move upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}

// Do all processing for one canonicalized loop in the nest
// (assuming that outer loops already were processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process loop nest to have a parallelepiped shape,
// calculate biggest spans for IVs on all levels and calculate overall trip
// count. "bounds_nest" has to be allocated per thread.
// Returns overall trip count (for adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate number of iterations in original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
// (not counting initial iteration)
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}

// Calculate number of iterations in the original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take updated bounds and calculate what new_iv will correspond to this point.
// When we are getting original IVs from new_iv, we have to adjust to fit into
// original loops bounds. Getting new_iv for the adjusted original IVs will
// help with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // Space is empty:
        return false;
      } else {
        // Go to next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: the lower bounds of all loops may not work if, on some iterations of
// the outer loops, the inner loops are empty.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From current set of original IVs calculate next point.
// Return false if there is no next point in the loop bounds.
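// For example, in a triangular nest i <= 2, j <= i (illustrative values), the
// point {1, 1} advances to {2, 0}: bumping the innermost iteration overshoots
// the inner upper bound, so the outer iteration is incremented and the inner
// IV restarts from its lower bound.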
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the inner-most level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Return true if it fits into upper bound on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  // big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, may as well use
    // the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in expanded space, but:
    // - we need to hit original space lower bound, so need to account for
    //   that
    // - we have to go into original space, even if that means adding more
    //   iterations than was planned
    // - we have to go past (or equal to) previous point (which is the chunk
    //   starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use
      // heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels start of the chunk is same as the end, need to
      // be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // End of the chunk can't be smaller (for >= bigger) than its start.
        // Use heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate old induction variables corresponding to overall new_iv for the
// chunk end. If due to space extension we are getting old IVs outside of the
// boundaries, bring them into the boundaries. Need to do this in the runtime,
// especially on the lower bounds side.
// When getting the result we need to make sure that the new chunk starts at
// the next position after the old chunk and does not overlap with it (this is
// done elsewhere), and we need to make sure the end of the chunk is further
// than its beginning. We don't need an exact ending point here, just
// something more-or-less close to the desired chunk length; bigger is fine
// (smaller would be fine too, but we risk going into an infinite loop, so do
// smaller only at the very end of the space).
// Result: false if we could not find the ending point in the original loop
// space. In this case the caller can use original upper bounds as the end of
// the chunk. The chunk won't be empty, because it'll have at least the
// starting point, which is by construction in the original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // can't be same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use original upper
// bounds (adjusted when canonicalized to use <= / >=). No need to check that
// this point is in the original space (it's likely not).
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

/**************************************************************************
 * Identify nested loop structure - loops come in the canonical form
 * Lower triangle matrix: i = 0; i <= N; i++        {0,0}:{N,0}
 *                        j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
 * Upper Triangle matrix
 *                        i = 0;     i <= N; i++    {0,0}:{N,0}
 *                        j = 0+1*i; j <= N; j++    {0,1}:{N,0}
 * ************************************************************************/
nested_loop_type_t
kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
                                   /*in*/ kmp_index_t n) {
  // only 2-level nested loops are supported
  if (n != 2) {
    return nested_loop_type_unkown;
  }
  // loops must be canonical
  KMP_ASSERT(
      (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
      (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
  // check outer loop bounds: for triangular need to be {0,0}:{N,0}
  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb0_u64);
  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb1_u64);
  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub1_u64);
  if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
    return nested_loop_type_unkown;
  }
  // check inner bounds to determine triangle type
  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb0_u64);
  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub0_u64);
  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb1_u64);
  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub1_u64);
  // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
      (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
    return nested_loop_type_lower_triangular_matrix;
  }
  // upper triangle loop inner bounds need to be {0,1}:{N,0}
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
      inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
    return nested_loop_type_upper_triangular_matrix;
  }
  return nested_loop_type_unkown;
}

/**************************************************************************
 * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
 * Start point is x so the result is always > sqrt(x)
 * The method has uniform convergence, PRECISION is set to 0.1
 * ************************************************************************/
#define level_of_precision 0.1
double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
  double sqrt_old = 0.;
  double sqrt_new = (double)x;
  do {
    sqrt_old = sqrt_new;
    sqrt_new = (sqrt_old + x / sqrt_old) / 2;
  } while ((sqrt_old - sqrt_new) > level_of_precision);
  return sqrt_new;
}

/**************************************************************************
 * Handle lower triangle matrix in the canonical form
 *   i = 0; i <= N; i++            {0,0}:{N,0}
 *   j = 0; j <= 0/-1 + 1*i; j++   {0,0}:{0/-1,1}
 * ************************************************************************/
void kmp_handle_lower_triangle_matrix(
    /*in*/ kmp_uint32 nth,
    /*in*/ kmp_uint32 tid,
    /*in */ kmp_index_t n,
    /*in/out*/ bounds_info_t *original_bounds_nest,
    /*out*/ bounds_info_t *chunk_bounds_nest) {

  // transfer loop types from the original loop to the chunks
  for (kmp_index_t i = 0; i < n; ++i) {
    chunk_bounds_nest[i] = original_bounds_nest[i];
  }
  // cleanup iv variables
  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].lb0_u64);
  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                    original_bounds_nest[1].ub0_u64);
  // calculate the chunk's lower and upper bounds
  // the total number of iterations in the loop is the sum of the arithmetic
  // progression from the outer lower to the outer upper bound (inclusive,
  // since the loop is canonical); note that less_than inner loops
  // (inner_ub0 = -1) effectively make the progression 1-based, turning
  // N = (outer_ub0 - outer_lb0 + 1) into N - 1
  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
  // the current thread's number of iterations:
  // each thread gets an equal number of iterations: the total number of
  // iterations divided by the number of threads; plus, if there's a remainder,
  // the first threads (with tid below the remainder) get an additional
  // iteration each to cover it
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
  // cumulative number of iterations executed by all the previous threads:
  // threads with a tid below the remainder will have (iter_total/nth+1)
  // elements, and so will all threads before them, so the cumulative number of
  // iterations executed by all the previous threads is the current thread's
  // number of iterations multiplied by the number of previous threads, which
  // is equal to the current thread's tid; threads with a tid equal to or above
  // the remainder will have (iter_total/nth) elements, so the cumulative
  // number of iterations previously executed is its number of iterations
  // multiplied by the number of previous threads (again equal to the current
  // thread's tid) PLUS all the remainder iterations that will have been
  // executed by the previous threads
  kmp_uint64 iter_before_current =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
  // cumulative number of iterations executed with the current thread is
  // the cumulative number executed before it plus its own
  kmp_uint64 iter_with_current = iter_before_current + iter_current;
  // calculate the outer loop lower bound (lbo) which is the max outer iv value
  // that gives the number of iterations that is equal or just below the total
  // number of iterations executed by the previous threads; for less_than
  // (1-based) inner loops (inner_ub0 == -1) it will be:
  //   lbo*(lbo-1)/2 <= iter_before_current => lbo^2-lbo-2*iter_before_current <= 0
  // for less_than_equal (0-based) inner loops (inner_ub0 == 0) it will be:
  //   lbo*(lbo+1)/2 <= iter_before_current => lbo^2+lbo-2*iter_before_current <= 0
  // both cases can be handled similarly using a parameter to control the
  // equation sign
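  // A worked example with hypothetical values: for nth = 2, tid = 0 and the
  // nest i = 0; i <= 3; i++ / j = 0; j <= i; j++ (inner_ub0 == 0), we get
  // outer_iters = 4, iter_total = 10, iter_current = 5 and
  // iter_before_current = 0; the formulas below then give thread 0 the chunk
  // {0,0}..{2,1} and thread 1 the chunk {2,2}..{3,3}.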
  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_before_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop lower bound which is the remaining number of
  // iterations required to hit the total number of iterations executed by the
  // previous threads giving the starting point of this thread
  kmp_uint64 lower_bound_inner =
      iter_before_current -
      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
  // calculate the outer loop upper bound using the same approach as for the
  // inner bound except using the total number of iterations executed with the
  // current thread
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_with_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop upper bound which is the remaining number of
  // iterations required to hit the total number of iterations executed after
  // the current thread giving the starting point of the next thread
  kmp_uint64 upper_bound_inner =
      iter_with_current -
      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
  // adjust the upper bounds down by 1 element to point at the last iteration
  // of the current thread, not the first iteration of the next thread
  if (upper_bound_inner == 0) {
    // {n,0} => {n-1,n-1}
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // {n,m} => {n,m-1} (m!=0)
    upper_bound_inner -= 1;
  }

  // assign the values, zeroing out lb1 and ub1 values since the iteration
  // space is now one-dimensional
  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
#endif
}

/**************************************************************************
 * Handle upper triangle matrix in the canonical form
 *   i = 0;     i <= N; i++   {0,0}:{N,0}
 *   j = 0+1*i; j <= N; j++   {0,1}:{N,0}
 * ************************************************************************/
void kmp_handle_upper_triangle_matrix(
    /*in*/ kmp_uint32 nth,
    /*in*/ kmp_uint32 tid,
    /*in */ kmp_index_t n,
    /*in/out*/ bounds_info_t *original_bounds_nest,
    /*out*/ bounds_info_t *chunk_bounds_nest) {

  // transfer loop types from the original loop to the chunks
  for (kmp_index_t i = 0; i < n; ++i) {
    chunk_bounds_nest[i] = original_bounds_nest[i];
  }
  // cleanup iv variables
  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].lb0_u64);
  [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
      original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
  // calculate the chunk's lower and upper bounds
  // the total number of iterations in the loop is the sum of the arithmetic
  // progression from the outer lower to the outer upper bound (inclusive,
  // since the loop is canonical); note that less_than inner loops
  // (inner_ub0 = -1) effectively make the progression 1-based, turning
  // N = (outer_ub0 - outer_lb0 + 1) into N - 1
  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
  // the current thread's number of iterations:
  // each thread gets an equal number of iterations: the total number of
  // iterations divided by the number of threads; plus, if there's a remainder,
  // the first threads (with tid below the remainder) get an additional
  // iteration each to cover it
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
  // cumulative number of iterations executed by all the previous threads:
  // threads with a tid below the remainder will have (iter_total/nth+1)
  // elements, and so will all threads before them, so the cumulative number of
  // iterations executed by all the previous threads is the current thread's
  // number of iterations multiplied by the number of previous threads, which
  // is equal to the current thread's tid; threads with a tid equal to or above
  // the remainder will have (iter_total/nth) elements, so the cumulative
  // number of iterations previously executed is its number of iterations
  // multiplied by the number of previous threads (again equal to the current
  // thread's tid) PLUS all the remainder iterations that will have been
  // executed by the previous threads
  kmp_uint64 iter_before_current =
  // cumulative number of iterations executed with the current thread is
  // the cumulative number executed before it plus its own
  kmp_uint64 iter_with_current = iter_before_current + iter_current;
  // calculate the outer loop lower bound (lbo) which is the max outer iv value
  // that gives the number of iterations that is equal to or just below the
  // total number of iterations executed by the previous threads:
  //     lbo*(lbo+1)/2<=iter_before_current =>
  //     lbo^2+lbo-2*iter_before_current<=0
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
  // calculate the inner loop lower bound which is the remaining number of
  // iterations required to hit the total number of iterations executed by the
  // previous threads, giving the starting point of this thread
  kmp_uint64 lower_bound_inner =
      iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
  // calculate the outer loop upper bound using the same approach as for the
  // inner bound, except using the total number of iterations executed with the
  // current thread
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
  // calculate the inner loop upper bound which is the remaining number of
  // iterations required to hit the total number of iterations executed after
  // the current thread, giving the starting point of the next thread
  kmp_uint64 upper_bound_inner =
      iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
  // adjust the upper bounds down by 1 element to point at the last iteration
  // of the current thread rather than the first iteration of the next thread
  if (upper_bound_inner == 0) {
    // {n,0} => {n-1,n-1}
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // {n,m} => {n,m-1} (m!=0)
    upper_bound_inner -= 1;
  }

  // assign the values, reflecting the lower-triangle style split back onto the
  // upper triangle, and zeroing out lb1 and ub1 values since the iteration
  // space is now one-dimensional
  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
#endif
}
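
// Note on the mapping above: the split is computed exactly as for a lower
// triangle (counting iterations from the last outer row backwards) and then
// reflected with (outer_iters - 1) - x, so lower-numbered threads are
// assigned the later outer rows. Illustrative example with assumed values:
// for the nest {i = 0..2, j = i..2} (outer_iters = 3, iter_total = 6) and
// nth = 2, tid 0 computes the lower-triangle range {0,0}..{1,1}, which maps
// to {1,1}..{2,2} of the upper triangle (3 iterations), and tid 1 computes
// {2,0}..{2,2}, which maps to {0,0}..{0,2} (3 iterations).
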
//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated). Internally API will expand the space
// to parallelogram/parallelepiped, calculate total, calculate bounds for the
// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
// important on the left side, to hit the lower bounds and not step over), and
// pick the correct chunk for this thread (so it will calculate chunks up to
// the needed one). It could be optimized to calculate just this chunk,
// potentially at the cost of a slightly less even distribution among threads.
// It is designed to make sure that threads will receive predictable chunks,
// deterministically (so that the next nest of loops with similar
// characteristics will get exactly the same chunks on the same threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0,
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
// future).
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // Loop won't execute:
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  // Handle special cases
  nested_loop_type_t loop_type =
      kmp_identify_nested_loop_structure(original_bounds_nest, n);
  if (loop_type == nested_loop_type_lower_triangular_matrix) {
    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  }

  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // Loop won't execute:
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test
  // (2) without it the current contract is preserved: chunk_bounds_nest has
  //     only lb0 and ub0; lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //   // One thread:
  //   // Copy all info from original_bounds_nest, it'll be good enough.

  //   for (kmp_index_t i = 0; i < n; ++i) {
  //     chunk_bounds_nest[i] = original_bounds_nest[i];
  //   }

  //   if (plastiter != NULL) {
  //     *plastiter = TRUE;
  //   }
  //   return TRUE;
  // }

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  for (; nth > 0;) {
    // We could calculate the chunk size once, but this is to compensate for
    // the fact that the original space is not a parallelepiped and some
    // threads can be left without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was the last chunk, but no chunk for the current
      // thread was found:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // no more loop iterations left to process,
        // this means that the currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // no more loop iterations to process,
      // the previous chunk was the last chunk
      break;
    }

    // original_ivs_start is next to the previous chunk's original_ivs_end,
    // we need to start the new chunk here, so chunks will be one after another
    // without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  return FALSE;
}
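
// For callers: on a TRUE return, chunk_bounds_nest[k].lb0_u64/ub0_u64 give the
// inclusive start and end point of this thread's chunk in terms of the
// original IVs, with lb1_u64/ub1_u64 zeroed per the contract above; the chunk
// is the contiguous run of iterations between those two points in the
// original lexicographic iteration order. The block below is an illustrative,
// hypothetical sketch (not part of the runtime) of how such a chunk could be
// walked for the specific lower-triangle nest {i = 0..N, j = 0..i}.
#if 0
#include <cstdint>

// Hypothetical helper: iterate one chunk of the nest {i = 0..N, j = 0..i}
// given its inclusive start point {lb_i, lb_j} and end point {ub_i, ub_j}.
template <typename Body>
static void walk_lower_triangle_chunk(uint64_t lb_i, uint64_t lb_j,
                                      uint64_t ub_i, uint64_t ub_j, Body body) {
  for (uint64_t i = lb_i; i <= ub_i; ++i) {
    uint64_t j_first = (i == lb_i) ? lb_j : 0; // resume mid-row on first row
    uint64_t j_last = (i == ub_i) ? ub_j : i;  // stop mid-row on last row
    for (uint64_t j = j_first; j <= j_last; ++j)
      body(i, j);
  }
}
#endif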