/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// Avoid inadvertently using a library-based abs:
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

//----------Loop canonicalization---------------------------------------------

// For loop nest (any shape):
// convert != to < or >;
// switch from using < or > to <= or >=.
// The "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. It should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed):
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}
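
// Worked example (illustrative, not part of the runtime logic): a loop
// written as "for (int i = 0; i != 10; i += 2)" arrives here as lb0 = 0,
// ub0 = 10, step = 2, comparison = comp_not_eq. Since step > 0, != becomes <,
// and then < becomes <= by decrementing the upper bound, leaving lb0 = 0,
// ub0 = 9, step = 2, comparison = comp_less_or_eq.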

// Canonicalize loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV,
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use the span for this; it's too big.

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway:
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway:
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}
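
// Worked example (illustrative): for the canonicalized bounds lb0 = 0,
// ub0 = 9, step = 3 with comp_less_or_eq, the trip count is
// (9 - 0) / 3 + 1 = 4, matching the iterations i = 0, 3, 6, 9.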

// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim original iv according to its type.
// Return a kmp_uint64 value which can be easily used in all internal
// calculations and can be statically cast back to the original type in user
// code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
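
// Illustrative example: for loop_type_int8 an IV value of -1 is stored as
// 0xFFFFFFFFFFFFFFFF (truncated to kmp_int8, then sign-extended), while for
// loop_type_uint8 the input 0xFF stays 0x00000000000000FF (zero-extended).
// Either way, a static_cast back to the original type recovers the original
// value.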

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into the upper bounds on this level,
// false otherwise.
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of the loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into the lower-upper bounds on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we should
    // start with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}
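
// Worked example (illustrative): for a triangular inner loop
// "for (j = i; j <= 9; ++j)" the bounds are lb0 = 0, lb1 = 1, step = 1, so
// with outer_iv i = 4 and iteration = 2 this computes
// j = 0 + 1 * 4 + 2 * 1 = 6, and then checks j <= ub0 + ub1 * i = 9.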

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind.
// (No bounds check is needed here: the nest is rectangular, so the iteration
// numbers are valid by construction.)
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate the overall trip count.
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=; no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated).
// Returns the trip count of the overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}
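
// Worked example (illustrative): for the rectangular nest
//   for (i = 0; i < 3; ++i)
//     for (j = 0; j < 4; ++j)
// canonicalization turns the levels into i <= 2 and j <= 3, the per-level trip
// counts are 3 and 4, and the returned overall trip count is 3 * 4 = 12.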

//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to the overall new_iv.
// Note: original IVs will be returned as if they had kmp_uint64 type,
// and will have to be converted to the original type in user code.
// Note: trip counts should already be calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special-case 2 and 3 nested loops: either make a different
// interface without the array or possibly template this over n.
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calculate the corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
  __kmp_free(iterations);
}
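
// Worked example (illustrative): continuing the 3 x 4 nest above with
// new_iv = 7, the backward div/mod pass gives iterations[1] = 7 % 4 = 3 and
// iterations[0] = (7 / 4) % 3 = 1, so the original IVs come out as
// (i, j) = (1, 3).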

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate the span for the IV on this loop level for the "<=" case.
// Note: it's for <= on this loop nest level, so the lower bound should be the
// smallest value and the upper bound should be the biggest value. If the loop
// won't execute, 'smallest' may be bigger than 'biggest', but we'd better not
// switch them around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with the loop variable type on
    // the 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // the lower bound might be off after adjustments:

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check that the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate the span for the IV on this loop level for the ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with the loop variable type on
    // the 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // the lower bound might be off after adjustments:

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check that the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate the maximum possible span for the IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}
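
// Worked example (illustrative): for the parallelepiped inner loop
//   for (j = i; j <= 9 + i; ++j)   // lb0 = 0, lb1 = 1, ub0 = 9, ub1 = 1
// with the span of the outer i being [0, 9], the smallest lower-bound
// candidate is 0 + 1 * 0 = 0 and the biggest upper-bound candidate is
// 9 + 1 * 9 = 18, so the maximum possible span of j is [0, 18].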

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped
// shape. We need to stay in the original range of values, so that there will
// be no overflow; for that we'll adjust both the upper and lower bounds as
// needed.
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // Get the upper and lower bounds to be parallel,
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now we need to adjust lb0, ub0, otherwise in some cases the space will
    // shrink. The idea here is that for this IV we are now getting the same
    // span irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good on the biggest outer_iv value,
        // so we can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // We still need to move the lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good on the biggest outer_iv value,
        // so we can use it to find where to move the upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // We still need to move the upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}

// Do all processing for one canonicalized loop in the nest
// (assuming that the outer loops have already been processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}
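
// Worked example (illustrative): for the triangular inner loop
//   for (j = i; j <= 9; ++j)   // lb0 = 0, lb1 = 1, ub0 = 9, ub1 = 0
// __kmp_sign(lb1) != __kmp_sign(ub1), so the level is adjusted to a rectangle
// (lb1 = ub1 = 0). Then, since old_lb1 = 1 > lb1 = 0 in the <= case, lb0 is
// moved by (1 - 0) * span_smallest(i) = 0 and stays 0: the expanded space is
// j in [0, 9] for every i, a superset of the original triangle.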

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process the loop nest to get a parallelepiped shape,
// calculate the biggest spans for the IVs on all levels and calculate the
// overall trip count. "bounds_nest" has to be allocated per thread.
// Returns the overall trip count (for the adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative)
// (not counting the initial iteration).
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}
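
// Worked example (illustrative): with lb0 = 0, lb1 = 1, step = 2 on the inner
// level and the point (i, j) = (3, 7), the inner level contributes
// (7 - 0 - 1 * 3) / 2 = 2 iterations, i.e., j reached 7 after two steps from
// its lower bound 3.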

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative).
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take the updated bounds and calculate what new_iv will correspond to this
// point. When we are getting original IVs from new_iv, we have to adjust them
// to fit into the original loop bounds. Getting new_iv for the adjusted
// original IVs will help with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}
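
// Worked example (illustrative): this is a mixed-radix encoding. With
// per-level trip counts {3, 4} and per-level iterations {1, 3}, the flattened
// index is new_iv = (0 * 3 + 1) * 4 + 3 = 7, the inverse of the div/mod
// decomposition used on the rectangular path above.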

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for the provided iterations, assuming the iterations
// are calculated in the original space.
// The loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // Space is empty:
        return false;
      } else {
        // Go to the next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: the lower bounds of all loops may not work -
// the inner loops can be empty on some iterations of the outer loops.
// The loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  __kmp_free(iterations);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From the current set of original IVs calculate the next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so they can be
  // negative):
  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calculate the corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next, add one step to the iterations on the innermost level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  __kmp_free(iterations);
  return b;
}
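
// Worked example (illustrative): the search above behaves like an odometer
// with a validity check. For
//   for (i = 0; i <= 2; ++i)
//     for (j = 2 * i; j <= 2; ++j)
// the point after (1, 2) is sought by bumping j's iteration, which gives
// j = 2 * 1 + 1 = 3 > 2, so the carry moves to i. With i = 2 the inner loop
// restarts at j = 4 > 2, and bumping i again gives i = 3 > 2, so there is no
// next point and false is returned.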

//----------Calculate chunk end in the original loop space--------------------

// For one level, calculate the old induction variable corresponding to the
// overall new_iv for the chunk end.
// Return true if it fits into the upper bound on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  // big_span_t;

  // OMPTODO: is T good enough, or do we need ST, or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we may as
    // well use the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in the expanded space, but:
    // - we need to hit the original space lower bound, so we need to account
    //   for that;
    // - we have to go into the original space, even if that means adding more
    //   iterations than was planned;
    // - we have to go past (or be equal to) the previous point (which is the
    //   chunk starting point).

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), we didn't reach the original lower bound. Use
      // a heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels the start of the chunk is the same as its end,
      // so we need to be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // The end of the chunk can't be smaller (for >=, bigger) than its
        // start. Use a heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level, calculate the old induction variable corresponding to the
// overall new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate the old induction variables corresponding to the overall new_iv
// for the chunk end. If due to the space extension we are getting old IVs
// outside of the boundaries, bring them back into the boundaries. This needs
// to be done in the runtime, especially on the lower bounds side. When
// producing the result we need to make sure that the new chunk starts at the
// next position after the old chunk, not overlapping with it (this is done
// elsewhere), and we need to make sure the end of the chunk is further than
// its beginning. We don't need an exact ending point here, just something
// more-or-less close to the desired chunk length; bigger is fine (smaller
// would be fine too, but we'd risk going into an infinite loop, so we go
// smaller only at the very end of the space).
// Result: false if we could not find the ending point in the original loop
// space. In this case the caller can use the original upper bounds as the end
// of the chunk. The chunk won't be empty, because it'll have at least the
// starting point, which is by construction in the original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calculate the corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in the original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        __kmp_free(iterations);
        return false;
      } else {
        // Go to the next iteration on the outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // so it can't be the same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  __kmp_free(iterations);
  return true;
}
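
// Note (illustrative): equal_ind above tracks the longest prefix of levels on
// which the candidate chunk end is still equal to the chunk start. The
// compare_with_start flag is passed down only while that prefix reaches the
// level right above the current one, since that is exactly the situation
// where a too-small inner IV would make the chunk end precede its start.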

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use the original
// upper bounds (adjusted when canonicalized to use <= / >=). There is no need
// to check that this point is in the original space (it likely isn't).
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=; no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated). Internally the API
// will expand the space to a parallelogram/parallelepiped, calculate the
// total, calculate bounds for the chunks in terms of the new IV, re-calculate
// them in terms of the old IVs (especially important on the left side, to hit
// the lower bounds and not step over), and pick the correct chunk for this
// thread (so it will calculate chunks up to the needed one). It could be
// optimized to calculate just this chunk, potentially a bit less well
// distributed among threads. It is designed to make sure that threads will
// receive predictable chunks, deterministically (so that the next nest of
// loops with similar characteristics will get exactly the same chunks on the
// same threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0;
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
// future).
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  bounds_info_internal_t *updated_bounds_nest =
      (bounds_info_internal_t *)__kmp_allocate(sizeof(bounds_info_internal_t) *
                                               n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // Loop won't execute:
    __kmp_free(updated_bounds_nest);
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  kmp_point_t original_ivs_start =
      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
  kmp_point_t original_ivs_end =
      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
  kmp_point_t original_ivs_next_start =
      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // Loop won't execute:
    __kmp_free(updated_bounds_nest);
    __kmp_free(original_ivs_start);
    __kmp_free(original_ivs_end);
    __kmp_free(original_ivs_next_start);
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test;
  // (2) without it the current contract holds: chunk_bounds_nest has only lb0
  //     and ub0; lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //   // One thread:
  //   // Copy all info from original_bounds_nest, it'll be good enough.

  //   for (kmp_index_t i = 0; i < n; ++i) {
  //     chunk_bounds_nest[i] = original_bounds_nest[i];
  //   }

  //   if (plastiter != NULL) {
  //     *plastiter = TRUE;
  //   }
  //   __kmp_free(updated_bounds_nest);
  //   __kmp_free(original_ivs_start);
  //   __kmp_free(original_ivs_end);
  //   __kmp_free(original_ivs_next_start);
  //   return TRUE;
  //}

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;
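
  // Worked example (illustrative) of the distribution loop below: with
  // total = 10 iterations left and nth = 3 threads, thread 0 takes
  // ceil(10 / 3) = 4 iterations, after which the remaining 6 are split over 2
  // threads as 3 and 3. Re-dividing what is left on every step compensates for
  // chunks that shrink when they are clipped back to the original
  // (non-parallelepiped) space.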

  for (; nth > 0;) {
    // We could calculate the chunk size once, but this is to compensate for
    // the fact that the original space is not a parallelepiped and some
    // threads can be left without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was the last chunk, but no chunk for the current
      // thread was found:
      __kmp_free(updated_bounds_nest);
      __kmp_free(original_ivs_start);
      __kmp_free(original_ivs_end);
      __kmp_free(original_ivs_next_start);
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n,
                                      original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // No more loop iterations left to process,
        // which means that the currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in the chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      __kmp_free(updated_bounds_nest);
      __kmp_free(original_ivs_start);
      __kmp_free(original_ivs_end);
      __kmp_free(original_ivs_next_start);
      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // No more loop iterations to process;
      // the previous chunk was the last chunk:
      break;
    }

    // original_ivs_start is next to the previous chunk's original_ivs_end,
    // and we need to start the new chunk here, so that chunks will be one
    // after another without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  __kmp_free(updated_bounds_nest);
  __kmp_free(original_ivs_start);
  __kmp_free(original_ivs_end);
  __kmp_free(original_ivs_next_start);
  return FALSE;
}