/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// Avoid inadvertently using a library-based abs:
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};

//----------Loop canonicalization---------------------------------------------

// For a loop nest (of any shape):
// convert != to < or >;
// switch from using < or > to <= or >=.
// The "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. It should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed):
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}
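
// Illustrative example (not part of the runtime): a source loop
//   for (int i = 0; i < 10; i += 3)
// arrives as lb0 = 0, ub0 = 10, step = 3, comparison '<'.
// Canonicalization rewrites it to ub0 = 9 with comparison '<=';
// a '!=' comparison is first turned into '<' or '>' based on the sign of
// the step, then handled the same way.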

// Canonicalize the loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate the trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV,
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use the span for this; it's too big.

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway:
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway:
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}
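
// Illustrative example (not part of the runtime): the canonicalized loop
//   lb0 = 0, ub0 = 9, step = 3, comparison '<='
// yields trip_count = (9 - 0) / 3 + 1 = 4, i.e. the iterations
// i = 0, 3, 6, 9. The same formula with lb0 and ub0 swapped handles the
// '>=' (downward-counting) case.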

// Calculate the trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim the original iv according to its type.
// Returns a kmp_uint64 value which can be easily used in all internal
// calculations and can be statically cast back to the original type in user
// code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
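
// Illustrative example (not part of the runtime): for an int8 IV whose
// 64-bit carrier holds 0x00000000000000FF, kmp_fix_iv first truncates to
// kmp_int8 (-1 on the usual two's-complement targets) and then sign-extends
// back to kmp_uint64, producing 0xFFFFFFFFFFFFFFFF. A later
// static_cast<kmp_int8> in user code recovers -1 exactly.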

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into upper bounds on this level,
// false otherwise.
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into lower-upper bounds on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, should start
    // with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}
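
// Illustrative example (not part of the runtime) for kmp_calc_one_iv_XX
// above: for a non-rectangular inner loop
//   for (j = i; j <= N; j += 2)   // lb0 = 0, lb1 = 1, step = 2
// with the outer IV at i = 4, iteration number 3 on this level maps to
// j = 0 + 1 * 4 + 3 * 2 = 10; kmp_iv_is_in_upper_bound_XX then checks the
// result against ub0 + ub1 * i.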

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind.
// For a rectangular loop nest the result always fits into the bounds,
// so there is nothing to check here.
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize the loop nest and calculate the overall trip count.
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=, no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated).
// Returns the trip count of the overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}
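
// Illustrative example (not part of the runtime): for the rectangular nest
//   for (i = 0; i < 4; ++i)
//     for (j = 0; j < 5; ++j)
// the per-level trip counts are 4 and 5, so the call returns the collapsed
// iteration space size 4 * 5 = 20.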

//----------Calculate old induction variables---------------------------------

// Calculate the old induction variables corresponding to the overall new_iv.
// Note: the original IVs will be returned as if they had kmp_uint64 type;
// they will have to be converted to the original types in user code.
// Note: trip counts should already be calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do a different
// interface without the array or possibly template this over n.
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);

  // First, calculate the corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
}
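
// Illustrative example (not part of the runtime): with the per-level trip
// counts {4, 5} from the example above, new_iv = 13 decomposes (innermost
// level first) into 13 % 5 = 3 on the inner loop and 13 / 5 = 2 on the
// outer loop, i.e. the point (i, j) = (2, 3): a mixed-radix decomposition
// of new_iv.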

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate the span for the IV on this loop level for the "<=" case.
// Note: it's for <= on this loop nest level, so the lower bound should be the
// smallest value and the upper bound should be the biggest value. If the loop
// won't execute, 'smallest' may be bigger than 'biggest', but we'd better not
// switch them around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones, so it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to the step, because
      // the lower bound might be off after adjustments.

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate the span for the IV on this loop level for the ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones, so it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to the step, because
      // the lower bound might be off after adjustments.

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate the maximum possible span for the IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}
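
// Illustrative example (not part of the runtime): for the already-parallel
// nest
//   for (i = 0; i <= 9; ++i)
//     for (j = i; j <= i + 5; ++j)   // lb1 = ub1 = 1
// the outer level has span [0, 9]; on the inner level the lower-bound
// candidates are 0 + 0 and 0 + 9 (minimum taken, span_smallest = 0) and the
// upper-bound candidates are 5 + 0 and 5 + 9 (maximum taken,
// span_biggest = 14), so the inner IV can only ever take values in [0, 14].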

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped
// shape. We need to stay in the original range of values, so that there will
// be no overflow; for that we'll adjust both upper and lower bounds as needed.
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // Get the upper and lower bounds to be parallel
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now we need to adjust lb0, ub0, otherwise in some cases the space will
    // shrink. The idea here is that for this IV we are now getting the same
    // span irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good on the outer_iv biggest number,
        // so we can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // We still need to move the lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good on the outer_iv biggest number,
        // so we can use it to find where to move the upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // We still need to move the upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}

// Do all processing for one canonicalized loop in the nest
// (assuming that the outer loops have already been processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}
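
// Illustrative example (not part of the runtime): for the inner loop of
//   for (i = 0; i <= 9; ++i)
//     for (j = 0; j <= i; ++j)
// we have lb1 = 0, ub1 = 1. Since __kmp_sign(0) != __kmp_sign(1), the loop
// is adjusted to a rectangle (lb1 = ub1 = 0), and the upper bound is moved
// by (old_ub1 - 0) * span_biggest of the outer loop, so ub0 becomes 9. The
// expanded space is the 10x10 rectangle containing the original triangle;
// points outside the triangle are filtered out later, when the original IVs
// are computed.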

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process the loop nest to have a parallelepiped shape,
// calculate the biggest spans for the IVs on all levels, and calculate the
// overall trip count. "bounds_nest" has to be allocated per thread.
// Returns the overall trip count (for the adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative),
// not counting the initial iteration.
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}
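
// Illustrative example (not part of the runtime): this inverts
// kmp_calc_one_iv_XX. For lb0 = 0, lb1 = 1, step = 2, the outer IV at i = 4
// and the point j = 10, the iteration number is (10 - 0 - 1 * 4) / 2 = 3.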

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative).
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take the updated bounds and calculate which new_iv corresponds to this
// point. When we are getting original IVs from new_iv, we have to adjust to
// fit into the original loop bounds. Getting new_iv for the adjusted original
// IVs will help with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}

//----------Calculate original ivs for provided iterations--------------------

// Calculate the original IVs for the provided iterations, assuming the
// iterations are calculated in the original space.
// The loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // The space is empty:
        return false;
      } else {
        // Go to the next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate the IVs for the beginning of the loop nest.
// Note: the lower bounds of all loops may not give a valid point -
// on some of the iterations of the outer loops the inner loops may be empty.
// The loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}
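
// Illustrative example (not part of the runtime): for the nest
//   for (i = 0; i <= 2; ++i)
//     for (j = 1 - i; j <= 0; ++j)   // empty when i == 0
// starting from all-zero iterations yields the point (0, 1), which fails
// the inner upper-bound check, so the search backtracks and advances i; the
// first valid point of the nest is (i, j) = (1, 0), not the lower bounds.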

//----------Calculate next point in the original loop space-------------------

// From the current set of original IVs calculate the next point.
// Returns false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calculate the corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next, add one step to the iterations on the innermost level, and see if
  // we need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level calculate the old induction variable corresponding to the
// overall new_iv for the chunk end.
// Return true if it fits into the upper bound on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  // big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we may as
    // well use the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in the expanded space, but:
    // - we need to hit the original space lower bound, so we need to account
    //   for that;
    // - we have to go into the original space, even if that means adding more
    //   iterations than was planned;
    // - we have to go past (or equal to) the previous point (which is the
    //   chunk starting point).

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In the case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big): we didn't reach the original lower bound.
      // Use a heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels the start of the chunk is the same as its end,
      // so we need to be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // The end of the chunk can't be smaller (for >= bigger) than its
        // start. Use a heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate the old induction variable corresponding to the
// overall new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}
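
// Illustrative example (not part of the runtime) for the accountForStep
// logic in kmp_calc_one_iv_for_chunk_end_XX above: if the original lower
// bound is lb0 = 1 with step = 3, but the expanded space starts at
// updated lb0 = -1, then accountForStep = (1 - (-1)) % 3 = 2, so the
// candidate end points -1 + 2, -1 + 2 + 3, ... = 1, 4, 7, ... land exactly
// on the original iteration grid.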

// Calculate the old induction variables corresponding to the overall new_iv
// for the chunk end. If due to the space extension we are getting old IVs
// outside of the boundaries, bring them into the boundaries. We need to do
// this in the runtime, especially on the lower bounds side. When getting the
// result we need to make sure that the new chunk starts at the next position
// after the old chunk, not overlapping with it (this is done elsewhere), and
// we need to make sure the end of the chunk is further than the beginning of
// the chunk. We don't need an exact ending point here, just something
// more-or-less close to the desired chunk length; bigger is fine (smaller
// would be fine too, but we risk going into an infinite loop, so we do
// smaller only at the very end of the space).
// Returns false if we could not find the ending point in the original loop
// space. In this case the caller can use the original upper bounds as the
// end of the chunk. The chunk won't be empty, because it'll have at least
// the starting point, which is by construction in the original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calculate the corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in the original loop nest:
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to the next iteration on the outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // so it can't be the same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use the original
// upper bounds (adjusted when canonicalized to use <= / >=). No need to check
// that this point is in the original space (it's likely not).
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=, no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated). Internally the API
// will expand the space to a parallelogram/parallelepiped, calculate the
// total, calculate the bounds for the chunks in terms of the new IV,
// re-calculate them in terms of the old IVs (especially important on the left
// side, to hit the lower bounds and not step over), and pick the correct
// chunk for this thread (so it will calculate chunks up to the needed one).
// It could be optimized to calculate just this chunk, potentially a bit less
// well distributed among threads. It is designed to make sure that threads
// will receive predictable chunks, deterministically (so that the next nest
// of loops with similar characteristics will get exactly the same chunks on
// the same threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0;
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
// future.)
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // The loop won't execute:
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // The loop won't execute:
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test;
  // (2) without it we keep the current contract that chunk_bounds_nest has
  //     only lb0 and ub0, and lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //   // One thread:
  //   // Copy all info from original_bounds_nest, it'll be good enough.
  //
  //   for (kmp_index_t i = 0; i < n; ++i) {
  //     chunk_bounds_nest[i] = original_bounds_nest[i];
  //   }
  //
  //   if (plastiter != NULL) {
  //     *plastiter = TRUE;
  //   }
  //   return TRUE;
  // }

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  for (; nth > 0;) {
    // We could calculate the chunk size once, but this is to compensate that
    // the original space is not a parallelepiped and some threads can be left
    // without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was the last chunk, but no chunk for the current
      // thread was found:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // No more loop iterations left to process,
        // which means that the currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in the chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // No more loop iterations to process,
      // the previous chunk was the last chunk:
      break;
    }

    // original_ivs_start is next to the previous chunk's original_ivs_end,
    // and we need to start the new chunk here, so that chunks will be one
    // after another without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  return FALSE;
}
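
// Illustrative usage sketch (not part of the runtime): compiler-generated
// code for "#pragma omp for collapse(2)" over a non-rectangular nest would,
// per thread, fill a bounds_info_t array describing each loop and then do
// roughly the following (the local names are hypothetical):
//
//   bounds_info_t original[2] = {/* outer loop */, /* inner loop */};
//   bounds_info_t chunk[2];
//   kmp_int32 plastiter;
//   if (__kmpc_for_collapsed_init(&loc, gtid, original, chunk, 2,
//                                 &plastiter)) {
//     // Iterate from chunk[i].lb0_u64 to chunk[i].ub0_u64 on each level,
//     // casting back to the original IV types; lb1/ub1 are 0 by contract.
//   }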