xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_collapse.cpp (revision 5ca8e32633c4ffbbcd6762e5888b6a4ba0708c6c)
1 /*
2  * kmp_collapse.cpp -- loop collapse feature
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_error.h"
15 #include "kmp_i18n.h"
16 #include "kmp_itt.h"
17 #include "kmp_stats.h"
18 #include "kmp_str.h"
19 #include "kmp_collapse.h"
20 
21 #if OMPT_SUPPORT
22 #include "ompt-specific.h"
23 #endif
24 
25 // OMPTODO: different style of comments (see kmp_sched)
26 // OMPTODO: OMPT/OMPD
27 
28 // avoid inadevertently using a library based abs
29 template <typename T> T __kmp_abs(const T val) {
30   return (val < 0) ? -val: val;
31 }
32 kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
33 kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
34 
35 //----------------------------------------------------------------------------
36 // Common functions for working with rectangular and non-rectangular loops
37 //----------------------------------------------------------------------------
38 
39 template <typename T> int __kmp_sign(T val) { return (T(0) < val) - (val < T(0)); }
40 
41 //----------Loop canonicalization---------------------------------------------
42 
43 // For loop nest (any shape):
44 // convert != to < or >;
45 // switch from using < or > to <= or >=.
46 // "bounds" array has to be allocated per thread.
47 // All other internal functions will work only with canonicalized loops.
48 template <typename T>
49 void kmp_canonicalize_one_loop_XX(
50     ident_t *loc,
51     /*in/out*/ bounds_infoXX_template<T> *bounds) {
52 
53   if (__kmp_env_consistency_check) {
54     if (bounds->step == 0) {
55       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
56                             loc);
57     }
58   }
59 
60   if (bounds->comparison == comparison_t::comp_not_eq) {
61     // We can convert this to < or >, depends on the sign of the step:
62     if (bounds->step > 0) {
63       bounds->comparison = comparison_t::comp_less;
64     } else {
65       bounds->comparison = comparison_t::comp_greater;
66     }
67   }
68 
69   if (bounds->comparison == comparison_t::comp_less) {
70     // Note: ub0 can be unsigned. Should be Ok to hit overflow here,
71     // because ub0 + ub1*j should be still positive (otherwise loop was not
72     // well formed)
73     bounds->ub0 -= 1;
74     bounds->comparison = comparison_t::comp_less_or_eq;
75   } else if (bounds->comparison == comparison_t::comp_greater) {
76     bounds->ub0 += 1;
77     bounds->comparison = comparison_t::comp_greater_or_eq;
78   }
79 }
80 
81 // Canonicalize loop nest. original_bounds_nest is an array of length n.
82 void kmp_canonicalize_loop_nest(ident_t *loc,
83                                 /*in/out*/ bounds_info_t *original_bounds_nest,
84                                 kmp_index_t n) {
85 
86   for (kmp_index_t ind = 0; ind < n; ++ind) {
87     auto bounds = &(original_bounds_nest[ind]);
88 
89     switch (bounds->loop_type) {
90     case loop_type_t::loop_type_int32:
91       kmp_canonicalize_one_loop_XX<kmp_int32>(
92           loc,
93           /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
94       break;
95     case loop_type_t::loop_type_uint32:
96       kmp_canonicalize_one_loop_XX<kmp_uint32>(
97           loc,
98           /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
99       break;
100     case loop_type_t::loop_type_int64:
101       kmp_canonicalize_one_loop_XX<kmp_int64>(
102           loc,
103           /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
104       break;
105     case loop_type_t::loop_type_uint64:
106       kmp_canonicalize_one_loop_XX<kmp_uint64>(
107           loc,
108           /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
109       break;
110     default:
111       KMP_ASSERT(false);
112     }
113   }
114 }
115 
116 //----------Calculating trip count on one level-------------------------------
117 
118 // Calculate trip count on this loop level.
119 // We do this either for a rectangular loop nest,
120 // or after an adjustment bringing the loops to a parallelepiped shape.
121 // This number should not depend on the value of outer IV
122 // even if the formular has lb1 and ub1.
123 // Note: for non-rectangular loops don't use span for this, it's too big.
124 
125 template <typename T>
126 kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
127     /*in/out*/ bounds_infoXX_template<T> *bounds) {
128 
129   if (bounds->comparison == comparison_t::comp_less_or_eq) {
130     if (bounds->ub0 < bounds->lb0) {
131       // Note: after this we don't need to calculate inner loops,
132       // but that should be an edge case:
133       bounds->trip_count = 0;
134     } else {
135       // ub - lb may exceed signed type range; we need to cast to
136       // kmp_loop_nest_iv_t anyway
137       bounds->trip_count =
138           static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
139               __kmp_abs(bounds->step) +
140           1;
141     }
142   } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
143     if (bounds->lb0 < bounds->ub0) {
144       // Note: after this we don't need to calculate inner loops,
145       // but that should be an edge case:
146       bounds->trip_count = 0;
147     } else {
148       // lb - ub may exceed signed type range; we need to cast to
149       // kmp_loop_nest_iv_t anyway
150       bounds->trip_count =
151           static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
152               __kmp_abs(bounds->step) +
153           1;
154     }
155   } else {
156     KMP_ASSERT(false);
157   }
158   return bounds->trip_count;
159 }
160 
161 // Calculate trip count on this loop level.
162 kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {
163 
164   kmp_loop_nest_iv_t trip_count = 0;
165 
166   switch (bounds->loop_type) {
167   case loop_type_t::loop_type_int32:
168     trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
169         /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
170     break;
171   case loop_type_t::loop_type_uint32:
172     trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
173         /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
174     break;
175   case loop_type_t::loop_type_int64:
176     trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
177         /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
178     break;
179   case loop_type_t::loop_type_uint64:
180     trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
181         /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
182     break;
183   default:
184     KMP_ASSERT(false);
185   }
186 
187   return trip_count;
188 }
189 
190 //----------Trim original iv according to its type----------------------------
191 
192 // Trim original iv according to its type.
193 // Return kmp_uint64 value which can be easily used in all internal calculations
194 // And can be statically cast back to original type in user code.
195 kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
196   kmp_uint64 res = 0;
197 
198   switch (loop_iv_type) {
199   case loop_type_t::loop_type_int8:
200     res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
201     break;
202   case loop_type_t::loop_type_uint8:
203     res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
204     break;
205   case loop_type_t::loop_type_int16:
206     res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
207     break;
208   case loop_type_t::loop_type_uint16:
209     res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
210     break;
211   case loop_type_t::loop_type_int32:
212     res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
213     break;
214   case loop_type_t::loop_type_uint32:
215     res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
216     break;
217   case loop_type_t::loop_type_int64:
218     res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
219     break;
220   case loop_type_t::loop_type_uint64:
221     res = static_cast<kmp_uint64>(original_iv);
222     break;
223   default:
224     KMP_ASSERT(false);
225   }
226 
227   return res;
228 }
229 
230 //----------Compare two IVs (remember they have a type)-----------------------
231 
232 bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
233                 kmp_uint64 original_iv2) {
234   bool res = false;
235 
236   switch (loop_iv_type) {
237   case loop_type_t::loop_type_int8:
238     res = static_cast<kmp_int8>(original_iv1) ==
239           static_cast<kmp_int8>(original_iv2);
240     break;
241   case loop_type_t::loop_type_uint8:
242     res = static_cast<kmp_uint8>(original_iv1) ==
243           static_cast<kmp_uint8>(original_iv2);
244     break;
245   case loop_type_t::loop_type_int16:
246     res = static_cast<kmp_int16>(original_iv1) ==
247           static_cast<kmp_int16>(original_iv2);
248     break;
249   case loop_type_t::loop_type_uint16:
250     res = static_cast<kmp_uint16>(original_iv1) ==
251           static_cast<kmp_uint16>(original_iv2);
252     break;
253   case loop_type_t::loop_type_int32:
254     res = static_cast<kmp_int32>(original_iv1) ==
255           static_cast<kmp_int32>(original_iv2);
256     break;
257   case loop_type_t::loop_type_uint32:
258     res = static_cast<kmp_uint32>(original_iv1) ==
259           static_cast<kmp_uint32>(original_iv2);
260     break;
261   case loop_type_t::loop_type_int64:
262     res = static_cast<kmp_int64>(original_iv1) ==
263           static_cast<kmp_int64>(original_iv2);
264     break;
265   case loop_type_t::loop_type_uint64:
266     res = static_cast<kmp_uint64>(original_iv1) ==
267           static_cast<kmp_uint64>(original_iv2);
268     break;
269   default:
270     KMP_ASSERT(false);
271   }
272 
273   return res;
274 }
275 
276 //----------Calculate original iv on one level--------------------------------
277 
278 // Return true if the point fits into upper bounds on this level,
279 // false otherwise
280 template <typename T>
281 bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
282                                  const kmp_point_t original_ivs,
283                                  kmp_index_t ind) {
284 
285   T iv = static_cast<T>(original_ivs[ind]);
286   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
287 
288   if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
289        (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
290       ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
291        (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
292     // The calculated point is outside of loop upper boundary:
293     return false;
294   }
295 
296   return true;
297 }
298 
299 // Calculate one iv corresponding to iteration on the level ind.
300 // Return true if it fits into lower-upper bounds on this level
301 // (if not, we need to re-calculate)
302 template <typename T>
303 bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
304                         /*in/out*/ kmp_point_t original_ivs,
305                         const kmp_iterations_t iterations, kmp_index_t ind,
306                         bool start_with_lower_bound, bool checkBounds) {
307 
308   kmp_uint64 temp = 0;
309   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
310 
311   if (start_with_lower_bound) {
312     // we moved to the next iteration on one of outer loops, should start
313     // with the lower bound here:
314     temp = bounds->lb0 + bounds->lb1 * outer_iv;
315   } else {
316     auto iteration = iterations[ind];
317     temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
318   }
319 
320   // Now trim original iv according to its type:
321   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
322 
323   if (checkBounds) {
324     return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
325   } else {
326     return true;
327   }
328 }
329 
330 bool kmp_calc_one_iv(const bounds_info_t *bounds,
331                      /*in/out*/ kmp_point_t original_ivs,
332                      const kmp_iterations_t iterations, kmp_index_t ind,
333                      bool start_with_lower_bound, bool checkBounds) {
334 
335   switch (bounds->loop_type) {
336   case loop_type_t::loop_type_int32:
337     return kmp_calc_one_iv_XX<kmp_int32>(
338         (bounds_infoXX_template<kmp_int32> *)(bounds),
339         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
340         checkBounds);
341     break;
342   case loop_type_t::loop_type_uint32:
343     return kmp_calc_one_iv_XX<kmp_uint32>(
344         (bounds_infoXX_template<kmp_uint32> *)(bounds),
345         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
346         checkBounds);
347     break;
348   case loop_type_t::loop_type_int64:
349     return kmp_calc_one_iv_XX<kmp_int64>(
350         (bounds_infoXX_template<kmp_int64> *)(bounds),
351         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
352         checkBounds);
353     break;
354   case loop_type_t::loop_type_uint64:
355     return kmp_calc_one_iv_XX<kmp_uint64>(
356         (bounds_infoXX_template<kmp_uint64> *)(bounds),
357         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
358         checkBounds);
359     break;
360   default:
361     KMP_ASSERT(false);
362     return false;
363   }
364 }
365 
366 //----------Calculate original iv on one level for rectangular loop nest------
367 
368 // Calculate one iv corresponding to iteration on the level ind.
369 // Return true if it fits into lower-upper bounds on this level
370 // (if not, we need to re-calculate)
371 template <typename T>
372 void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
373                                 /*in/out*/ kmp_uint64 *original_ivs,
374                                 const kmp_iterations_t iterations,
375                                 kmp_index_t ind) {
376 
377   auto iteration = iterations[ind];
378 
379   kmp_uint64 temp =
380       bounds->lb0 +
381       bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
382       iteration * bounds->step;
383 
384   // Now trim original iv according to its type:
385   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
386 }
387 
388 void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
389                              /*in/out*/ kmp_uint64 *original_ivs,
390                              const kmp_iterations_t iterations,
391                              kmp_index_t ind) {
392 
393   switch (bounds->loop_type) {
394   case loop_type_t::loop_type_int32:
395     kmp_calc_one_iv_rectang_XX<kmp_int32>(
396         (bounds_infoXX_template<kmp_int32> *)(bounds),
397         /*in/out*/ original_ivs, iterations, ind);
398     break;
399   case loop_type_t::loop_type_uint32:
400     kmp_calc_one_iv_rectang_XX<kmp_uint32>(
401         (bounds_infoXX_template<kmp_uint32> *)(bounds),
402         /*in/out*/ original_ivs, iterations, ind);
403     break;
404   case loop_type_t::loop_type_int64:
405     kmp_calc_one_iv_rectang_XX<kmp_int64>(
406         (bounds_infoXX_template<kmp_int64> *)(bounds),
407         /*in/out*/ original_ivs, iterations, ind);
408     break;
409   case loop_type_t::loop_type_uint64:
410     kmp_calc_one_iv_rectang_XX<kmp_uint64>(
411         (bounds_infoXX_template<kmp_uint64> *)(bounds),
412         /*in/out*/ original_ivs, iterations, ind);
413     break;
414   default:
415     KMP_ASSERT(false);
416   }
417 }
418 
419 //----------------------------------------------------------------------------
420 // Rectangular loop nest
421 //----------------------------------------------------------------------------
422 
423 //----------Canonicalize loop nest and calculate trip count-------------------
424 
425 // Canonicalize loop nest and calculate overall trip count.
426 // "bounds_nest" has to be allocated per thread.
427 // API will modify original bounds_nest array to bring it to a canonical form
428 // (only <= and >=, no !=, <, >). If the original loop nest was already in a
429 // canonical form there will be no changes to bounds in bounds_nest array
430 // (only trip counts will be calculated).
431 // Returns trip count of overall space.
432 extern "C" kmp_loop_nest_iv_t
433 __kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
434                                  /*in/out*/ bounds_info_t *original_bounds_nest,
435                                  kmp_index_t n) {
436 
437   kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
438 
439   kmp_loop_nest_iv_t total = 1;
440 
441   for (kmp_index_t ind = 0; ind < n; ++ind) {
442     auto bounds = &(original_bounds_nest[ind]);
443 
444     kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
445     total *= trip_count;
446   }
447 
448   return total;
449 }
450 
451 //----------Calculate old induction variables---------------------------------
452 
453 // Calculate old induction variables corresponding to overall new_iv.
454 // Note: original IV will be returned as if it had kmp_uint64 type,
455 // will have to be converted to original type in user code.
456 // Note: trip counts should be already calculated by
457 // __kmpc_process_loop_nest_rectang.
458 // OMPTODO: special case 2, 3 nested loops: either do different
459 // interface without array or possibly template this over n
460 extern "C" void
461 __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
462                                  const bounds_info_t *original_bounds_nest,
463                                  /*out*/ kmp_uint64 *original_ivs,
464                                  kmp_index_t n) {
465 
466   kmp_iterations_t iterations =
467       (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
468 
469   // First, calc corresponding iteration in every original loop:
470   for (kmp_index_t ind = n; ind > 0;) {
471     --ind;
472     auto bounds = &(original_bounds_nest[ind]);
473 
474     // should be optimized to OPDIVREM:
475     auto temp = new_iv / bounds->trip_count;
476     auto iteration = new_iv % bounds->trip_count;
477     new_iv = temp;
478 
479     iterations[ind] = iteration;
480   }
481   KMP_ASSERT(new_iv == 0);
482 
483   for (kmp_index_t ind = 0; ind < n; ++ind) {
484     auto bounds = &(original_bounds_nest[ind]);
485 
486     kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
487   }
488   __kmp_free(iterations);
489 }
490 
491 //----------------------------------------------------------------------------
492 // Non-rectangular loop nest
493 //----------------------------------------------------------------------------
494 
495 //----------Calculate maximum possible span of iv values on one level---------
496 
497 // Calculate span for IV on this loop level for "<=" case.
498 // Note: it's for <= on this loop nest level, so lower bound should be smallest
499 // value, upper bound should be the biggest value. If the loop won't execute,
500 // 'smallest' may be bigger than 'biggest', but we'd better not switch them
501 // around.
502 template <typename T>
503 void kmp_calc_span_lessoreq_XX(
504     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
505     /* in/out*/ bounds_info_internal_t *bounds_nest) {
506 
507   typedef typename traits_t<T>::unsigned_t UT;
508   // typedef typename traits_t<T>::signed_t ST;
509 
510   // typedef typename big_span_t span_t;
511   typedef T span_t;
512 
513   auto &bbounds = bounds->b;
514 
515   if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
516     // This dimention depends on one of previous ones; can't be the outermost
517     // one.
518     bounds_info_internalXX_template<T> *previous =
519         reinterpret_cast<bounds_info_internalXX_template<T> *>(
520             &(bounds_nest[bbounds.outer_iv]));
521 
522     // OMPTODO: assert that T is compatible with loop variable type on
523     // 'previous' loop
524 
525     {
526       span_t bound_candidate1 =
527           bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
528       span_t bound_candidate2 =
529           bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
530       if (bound_candidate1 < bound_candidate2) {
531         bounds->span_smallest = bound_candidate1;
532       } else {
533         bounds->span_smallest = bound_candidate2;
534       }
535     }
536 
537     {
538       // We can't adjust the upper bound with respect to step, because
539       // lower bound might be off after adjustments
540 
541       span_t bound_candidate1 =
542           bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
543       span_t bound_candidate2 =
544           bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
545       if (bound_candidate1 < bound_candidate2) {
546         bounds->span_biggest = bound_candidate2;
547       } else {
548         bounds->span_biggest = bound_candidate1;
549       }
550     }
551   } else {
552     // Rectangular:
553     bounds->span_smallest = bbounds.lb0;
554     bounds->span_biggest = bbounds.ub0;
555   }
556   if (!bounds->loop_bounds_adjusted) {
557     // Here it's safe to reduce the space to the multiply of step.
558     // OMPTODO: check if the formular is correct.
559     // Also check if it would be safe to do this if we didn't adjust left side.
560     bounds->span_biggest -=
561         (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
562   }
563 }
564 
565 // Calculate span for IV on this loop level for ">=" case.
566 template <typename T>
567 void kmp_calc_span_greateroreq_XX(
568     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
569     /* in/out*/ bounds_info_internal_t *bounds_nest) {
570 
571   typedef typename traits_t<T>::unsigned_t UT;
572   // typedef typename traits_t<T>::signed_t ST;
573 
574   // typedef typename big_span_t span_t;
575   typedef T span_t;
576 
577   auto &bbounds = bounds->b;
578 
579   if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
580     // This dimention depends on one of previous ones; can't be the outermost
581     // one.
582     bounds_info_internalXX_template<T> *previous =
583         reinterpret_cast<bounds_info_internalXX_template<T> *>(
584             &(bounds_nest[bbounds.outer_iv]));
585 
586     // OMPTODO: assert that T is compatible with loop variable type on
587     // 'previous' loop
588 
589     {
590       span_t bound_candidate1 =
591           bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
592       span_t bound_candidate2 =
593           bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
594       if (bound_candidate1 >= bound_candidate2) {
595         bounds->span_smallest = bound_candidate1;
596       } else {
597         bounds->span_smallest = bound_candidate2;
598       }
599     }
600 
601     {
602       // We can't adjust the upper bound with respect to step, because
603       // lower bound might be off after adjustments
604 
605       span_t bound_candidate1 =
606           bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
607       span_t bound_candidate2 =
608           bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
609       if (bound_candidate1 >= bound_candidate2) {
610         bounds->span_biggest = bound_candidate2;
611       } else {
612         bounds->span_biggest = bound_candidate1;
613       }
614     }
615 
616   } else {
617     // Rectangular:
618     bounds->span_biggest = bbounds.lb0;
619     bounds->span_smallest = bbounds.ub0;
620   }
621   if (!bounds->loop_bounds_adjusted) {
622     // Here it's safe to reduce the space to the multiply of step.
623     // OMPTODO: check if the formular is correct.
624     // Also check if it would be safe to do this if we didn't adjust left side.
625     bounds->span_biggest -=
626         (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
627   }
628 }
629 
630 // Calculate maximum possible span for IV on this loop level.
631 template <typename T>
632 void kmp_calc_span_XX(
633     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
634     /* in/out*/ bounds_info_internal_t *bounds_nest) {
635 
636   if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
637     kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
638   } else {
639     KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
640     kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
641   }
642 }
643 
644 //----------All initial processing of the loop nest---------------------------
645 
646 // Calculate new bounds for this loop level.
647 // To be able to work with the nest we need to get it to a parallelepiped shape.
648 // We need to stay in the original range of values, so that there will be no
649 // overflow, for that we'll adjust both upper and lower bounds as needed.
650 template <typename T>
651 void kmp_calc_new_bounds_XX(
652     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
653     /* in/out*/ bounds_info_internal_t *bounds_nest) {
654 
655   auto &bbounds = bounds->b;
656 
657   if (bbounds.lb1 == bbounds.ub1) {
658     // Already parallel, no need to adjust:
659     bounds->loop_bounds_adjusted = false;
660   } else {
661     bounds->loop_bounds_adjusted = true;
662 
663     T old_lb1 = bbounds.lb1;
664     T old_ub1 = bbounds.ub1;
665 
666     if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
667       // With this shape we can adjust to a rectangle:
668       bbounds.lb1 = 0;
669       bbounds.ub1 = 0;
670     } else {
671       // get upper and lower bounds to be parallel
672       // with values in the old range.
673       // Note: abs didn't work here.
674       if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
675           ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
676         bbounds.lb1 = old_ub1;
677       } else {
678         bbounds.ub1 = old_lb1;
679       }
680     }
681 
682     // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
683     // The idea here that for this IV we are now getting the same span
684     // irrespective of the previous IV value.
685     bounds_info_internalXX_template<T> *previous =
686         reinterpret_cast<bounds_info_internalXX_template<T> *>(
687             &bounds_nest[bbounds.outer_iv]);
688 
689     if (bbounds.comparison == comparison_t::comp_less_or_eq) {
690       if (old_lb1 < bbounds.lb1) {
691         KMP_ASSERT(old_lb1 < 0);
692         // The length is good on outer_iv biggest number,
693         // can use it to find where to move the lower bound:
694 
695         T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
696         bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
697                             // e.g. it was 0?? (same below)
698       } else if (old_lb1 > bbounds.lb1) {
699         // still need to move lower bound:
700         T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
701         bbounds.lb0 += add;
702       }
703 
704       if (old_ub1 > bbounds.ub1) {
705         KMP_ASSERT(old_ub1 > 0);
706         // The length is good on outer_iv biggest number,
707         // can use it to find where to move upper bound:
708 
709         T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
710         bbounds.ub0 += add;
711       } else if (old_ub1 < bbounds.ub1) {
712         // still need to move upper bound:
713         T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
714         bbounds.ub0 -= sub;
715       }
716     } else {
717       KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
718       if (old_lb1 < bbounds.lb1) {
719         KMP_ASSERT(old_lb1 < 0);
720         T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
721         bbounds.lb0 -= sub;
722       } else if (old_lb1 > bbounds.lb1) {
723         T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
724         bbounds.lb0 += add;
725       }
726 
727       if (old_ub1 > bbounds.ub1) {
728         KMP_ASSERT(old_ub1 > 0);
729         T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
730         bbounds.ub0 += add;
731       } else if (old_ub1 < bbounds.ub1) {
732         T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
733         bbounds.ub0 -= sub;
734       }
735     }
736   }
737 }
738 
739 // Do all processing for one canonicalized loop in the nest
740 // (assuming that outer loops already were processed):
741 template <typename T>
742 kmp_loop_nest_iv_t kmp_process_one_loop_XX(
743     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
744     /*in/out*/ bounds_info_internal_t *bounds_nest) {
745 
746   kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
747   kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
748   return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
749 }
750 
751 // Non-rectangular loop nest, canonicalized to use <= or >=.
752 // Process loop nest to have a parallelepiped shape,
753 // calculate biggest spans for IV's on all levels and calculate overall trip
754 // count. "bounds_nest" has to be allocated per thread.
755 // Returns overall trip count (for adjusted space).
756 kmp_loop_nest_iv_t kmp_process_loop_nest(
757     /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {
758 
759   kmp_loop_nest_iv_t total = 1;
760 
761   for (kmp_index_t ind = 0; ind < n; ++ind) {
762     auto bounds = &(bounds_nest[ind]);
763     kmp_loop_nest_iv_t trip_count = 0;
764 
765     switch (bounds->b.loop_type) {
766     case loop_type_t::loop_type_int32:
767       trip_count = kmp_process_one_loop_XX<kmp_int32>(
768           /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
769           /*in/out*/ bounds_nest);
770       break;
771     case loop_type_t::loop_type_uint32:
772       trip_count = kmp_process_one_loop_XX<kmp_uint32>(
773           /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
774           /*in/out*/ bounds_nest);
775       break;
776     case loop_type_t::loop_type_int64:
777       trip_count = kmp_process_one_loop_XX<kmp_int64>(
778           /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
779           /*in/out*/ bounds_nest);
780       break;
781     case loop_type_t::loop_type_uint64:
782       trip_count = kmp_process_one_loop_XX<kmp_uint64>(
783           /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
784           /*in/out*/ bounds_nest);
785       break;
786     default:
787       KMP_ASSERT(false);
788     }
789     total *= trip_count;
790   }
791 
792   return total;
793 }
794 
795 //----------Calculate iterations (in the original or updated space)-----------
796 
797 // Calculate number of iterations in original or updated space resulting in
798 // original_ivs[ind] (only on this level, non-negative)
799 // (not counting initial iteration)
800 template <typename T>
801 kmp_loop_nest_iv_t
802 kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
803                                  const kmp_point_t original_ivs,
804                                  kmp_index_t ind) {
805 
806   kmp_loop_nest_iv_t iterations = 0;
807 
808   if (bounds->comparison == comparison_t::comp_less_or_eq) {
809     iterations =
810         (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
811          bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
812         __kmp_abs(bounds->step);
813   } else {
814     KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
815     iterations = (bounds->lb0 +
816                   bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
817                   static_cast<T>(original_ivs[ind])) /
818                  __kmp_abs(bounds->step);
819   }
820 
821   return iterations;
822 }
823 
824 // Calculate number of iterations in the original or updated space resulting in
825 // original_ivs[ind] (only on this level, non-negative)
826 kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
827                                                  const kmp_point_t original_ivs,
828                                                  kmp_index_t ind) {
829 
830   switch (bounds->loop_type) {
831   case loop_type_t::loop_type_int32:
832     return kmp_calc_number_of_iterations_XX<kmp_int32>(
833         (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
834     break;
835   case loop_type_t::loop_type_uint32:
836     return kmp_calc_number_of_iterations_XX<kmp_uint32>(
837         (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
838     break;
839   case loop_type_t::loop_type_int64:
840     return kmp_calc_number_of_iterations_XX<kmp_int64>(
841         (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
842     break;
843   case loop_type_t::loop_type_uint64:
844     return kmp_calc_number_of_iterations_XX<kmp_uint64>(
845         (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
846     break;
847   default:
848     KMP_ASSERT(false);
849     return 0;
850   }
851 }
852 
853 //----------Calculate new iv corresponding to original ivs--------------------
854 
855 // We got a point in the original loop nest.
856 // Take updated bounds and calculate what new_iv will correspond to this point.
857 // When we are getting original IVs from new_iv, we have to adjust to fit into
858 // original loops bounds. Getting new_iv for the adjusted original IVs will help
859 // with making more chunks non-empty.
860 kmp_loop_nest_iv_t
861 kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
862                                   const kmp_point_t original_ivs,
863                                   kmp_index_t n) {
864 
865   kmp_loop_nest_iv_t new_iv = 0;
866 
867   for (kmp_index_t ind = 0; ind < n; ++ind) {
868     auto bounds = &(bounds_nest[ind].b);
869 
870     new_iv = new_iv * bounds->trip_count +
871              kmp_calc_number_of_iterations(bounds, original_ivs, ind);
872   }
873 
874   return new_iv;
875 }
876 
877 //----------Calculate original ivs for provided iterations--------------------
878 
879 // Calculate original IVs for provided iterations, assuming iterations are
880 // calculated in the original space.
881 // Loop nest is in canonical form (with <= / >=).
882 bool kmp_calc_original_ivs_from_iterations(
883     const bounds_info_t *original_bounds_nest, kmp_index_t n,
884     /*in/out*/ kmp_point_t original_ivs,
885     /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {
886 
887   kmp_index_t lengthened_ind = n;
888 
889   for (; ind < n;) {
890     auto bounds = &(original_bounds_nest[ind]);
891     bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
892                                 ind, (lengthened_ind < ind), true);
893 
894     if (!good) {
895       // The calculated iv value is too big (or too small for >=):
896       if (ind == 0) {
897         // Space is empty:
898         return false;
899       } else {
900         // Go to next iteration on the outer loop:
901         --ind;
902         ++iterations[ind];
903         lengthened_ind = ind;
904         for (kmp_index_t i = ind + 1; i < n; ++i) {
905           iterations[i] = 0;
906         }
907         continue;
908       }
909     }
910     ++ind;
911   }
912 
913   return true;
914 }
915 
916 //----------Calculate original ivs for the beginning of the loop nest---------
917 
918 // Calculate IVs for the beginning of the loop nest.
919 // Note: lower bounds of all loops may not work -
920 // if on some of the iterations of the outer loops inner loops are empty.
921 // Loop nest is in canonical form (with <= / >=).
922 bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
923                                      kmp_index_t n,
924                                      /*out*/ kmp_point_t original_ivs) {
925 
926   // Iterations in the original space, multiplied by step:
927   kmp_iterations_t iterations =
928       (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
929 
930   for (kmp_index_t ind = n; ind > 0;) {
931     --ind;
932     iterations[ind] = 0;
933   }
934 
935   // Now calculate the point:
936   bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
937                                                  /*in/out*/ original_ivs,
938                                                  /*in/out*/ iterations, 0);
939   __kmp_free(iterations);
940   return b;
941 }
942 
943 //----------Calculate next point in the original loop space-------------------
944 
945 // From current set of original IVs calculate next point.
946 // Return false if there is no next point in the loop bounds.
947 bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
948                                 kmp_index_t n, const kmp_point_t original_ivs,
949                                 /*out*/ kmp_point_t next_original_ivs) {
950   // Iterations in the original space, multiplied by step (so can be negative):
951   kmp_iterations_t iterations =
952       (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
953 
954   // First, calc corresponding iteration in every original loop:
955   for (kmp_index_t ind = 0; ind < n; ++ind) {
956     auto bounds = &(original_bounds_nest[ind]);
957     iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
958   }
959 
960   for (kmp_index_t ind = 0; ind < n; ++ind) {
961     next_original_ivs[ind] = original_ivs[ind];
962   }
963 
964   // Next add one step to the iterations on the inner-most level, and see if we
965   // need to move up the nest:
966   kmp_index_t ind = n - 1;
967   ++iterations[ind];
968 
969   bool b = kmp_calc_original_ivs_from_iterations(
970       original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);
971 
972   __kmp_free(iterations);
973   return b;
974 }
975 
976 //----------Calculate chunk end in the original loop space--------------------
977 
978 // For one level calculate old induction variable corresponding to overall
979 // new_iv for the chunk end.
980 // Return true if it fits into upper bound on this level
981 // (if not, we need to re-calculate)
982 template <typename T>
983 bool kmp_calc_one_iv_for_chunk_end_XX(
984     const bounds_infoXX_template<T> *bounds,
985     const bounds_infoXX_template<T> *updated_bounds,
986     /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
987     kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
988     const kmp_point_t original_ivs_start) {
989 
990   // typedef  std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
991   // big_span_t;
992 
993   // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
994   T temp = 0;
995 
996   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
997 
998   if (start_with_lower_bound) {
999     // we moved to the next iteration on one of outer loops, may as well use
1000     // the lower bound here:
1001     temp = bounds->lb0 + bounds->lb1 * outer_iv;
1002   } else {
1003     // Start in expanded space, but:
1004     // - we need to hit original space lower bound, so need to account for
1005     // that
1006     // - we have to go into original space, even if that means adding more
1007     // iterations than was planned
1008     // - we have to go past (or equal to) previous point (which is the chunk
1009     // starting point)
1010 
1011     auto iteration = iterations[ind];
1012 
1013     auto step = bounds->step;
1014 
1015     // In case of >= it's negative:
1016     auto accountForStep =
1017         ((bounds->lb0 + bounds->lb1 * outer_iv) -
1018          (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
1019         step;
1020 
1021     temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
1022            accountForStep + iteration * step;
1023 
1024     if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1025          (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
1026         ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1027          (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
1028       // Too small (or too big), didn't reach the original lower bound. Use
1029       // heuristic:
1030       temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
1031     }
1032 
1033     if (compare_with_start) {
1034 
1035       T start = static_cast<T>(original_ivs_start[ind]);
1036 
1037       temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1038 
1039       // On all previous levels start of the chunk is same as the end, need to
1040       // be really careful here:
1041       if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1042            (temp < start)) ||
1043           ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1044            (temp > start))) {
1045         // End of the chunk can't be smaller (for >= bigger) than it's start.
1046         // Use heuristic:
1047         temp = start + iteration / 4 * step;
1048       }
1049     }
1050   }
1051 
1052   original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1053 
1054   if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1055        (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
1056       ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1057        (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
1058     // Too big (or too small for >=).
1059     return false;
1060   }
1061 
1062   return true;
1063 }
1064 
1065 // For one level calculate old induction variable corresponding to overall
1066 // new_iv for the chunk end.
1067 bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
1068                                    const bounds_info_t *updated_bounds,
1069                                    /*in/out*/ kmp_point_t original_ivs,
1070                                    const kmp_iterations_t iterations,
1071                                    kmp_index_t ind, bool start_with_lower_bound,
1072                                    bool compare_with_start,
1073                                    const kmp_point_t original_ivs_start) {
1074 
1075   switch (bounds->loop_type) {
1076   case loop_type_t::loop_type_int32:
1077     return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
1078         (bounds_infoXX_template<kmp_int32> *)(bounds),
1079         (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
1080         /*in/out*/
1081         original_ivs, iterations, ind, start_with_lower_bound,
1082         compare_with_start, original_ivs_start);
1083     break;
1084   case loop_type_t::loop_type_uint32:
1085     return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
1086         (bounds_infoXX_template<kmp_uint32> *)(bounds),
1087         (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
1088         /*in/out*/
1089         original_ivs, iterations, ind, start_with_lower_bound,
1090         compare_with_start, original_ivs_start);
1091     break;
1092   case loop_type_t::loop_type_int64:
1093     return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
1094         (bounds_infoXX_template<kmp_int64> *)(bounds),
1095         (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
1096         /*in/out*/
1097         original_ivs, iterations, ind, start_with_lower_bound,
1098         compare_with_start, original_ivs_start);
1099     break;
1100   case loop_type_t::loop_type_uint64:
1101     return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
1102         (bounds_infoXX_template<kmp_uint64> *)(bounds),
1103         (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
1104         /*in/out*/
1105         original_ivs, iterations, ind, start_with_lower_bound,
1106         compare_with_start, original_ivs_start);
1107     break;
1108   default:
1109     KMP_ASSERT(false);
1110     return false;
1111   }
1112 }
1113 
1114 // Calculate old induction variables corresponding to overall new_iv for the
1115 // chunk end. If due to space extension we are getting old IVs outside of the
1116 // boundaries, bring them into the boundaries. Need to do this in the runtime,
1117 // esp. on the lower bounds side. When getting result need to make sure that the
1118 // new chunk starts at next position to old chunk, not overlaps with it (this is
1119 // done elsewhere), and need to make sure end of the chunk is further than the
1120 // beginning of the chunk. We don't need an exact ending point here, just
1121 // something more-or-less close to the desired chunk length, bigger is fine
1122 // (smaller would be fine, but we risk going into infinite loop, so do smaller
1123 // only at the very end of the space). result: false if could not find the
1124 // ending point in the original loop space. In this case the caller can use
1125 // original upper bounds as the end of the chunk. Chunk won't be empty, because
1126 // it'll have at least the starting point, which is by construction in the
1127 // original space.
1128 bool kmp_calc_original_ivs_for_chunk_end(
1129     const bounds_info_t *original_bounds_nest, kmp_index_t n,
1130     const bounds_info_internal_t *updated_bounds_nest,
1131     const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
1132     /*out*/ kmp_point_t original_ivs) {
1133 
1134   // Iterations in the expanded space:
1135   kmp_iterations_t iterations =
1136       (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
1137 
1138   // First, calc corresponding iteration in every modified loop:
1139   for (kmp_index_t ind = n; ind > 0;) {
1140     --ind;
1141     auto &updated_bounds = updated_bounds_nest[ind];
1142 
1143     // should be optimized to OPDIVREM:
1144     auto new_ind = new_iv / updated_bounds.b.trip_count;
1145     auto iteration = new_iv % updated_bounds.b.trip_count;
1146 
1147     new_iv = new_ind;
1148     iterations[ind] = iteration;
1149   }
1150   KMP_DEBUG_ASSERT(new_iv == 0);
1151 
1152   kmp_index_t lengthened_ind = n;
1153   kmp_index_t equal_ind = -1;
1154 
1155   // Next calculate the point, but in original loop nest.
1156   for (kmp_index_t ind = 0; ind < n;) {
1157     auto bounds = &(original_bounds_nest[ind]);
1158     auto updated_bounds = &(updated_bounds_nest[ind].b);
1159 
1160     bool good = kmp_calc_one_iv_for_chunk_end(
1161         bounds, updated_bounds,
1162         /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
1163         (equal_ind >= ind - 1), original_ivs_start);
1164 
1165     if (!good) {
1166       // Too big (or too small for >=).
1167       if (ind == 0) {
1168         // Need to reduce to the end.
1169         __kmp_free(iterations);
1170         return false;
1171       } else {
1172         // Go to next iteration on outer loop:
1173         --ind;
1174         ++(iterations[ind]);
1175         lengthened_ind = ind;
1176         if (equal_ind >= lengthened_ind) {
1177           // We've changed the number of iterations here,
1178           // can't be same anymore:
1179           equal_ind = lengthened_ind - 1;
1180         }
1181         for (kmp_index_t i = ind + 1; i < n; ++i) {
1182           iterations[i] = 0;
1183         }
1184         continue;
1185       }
1186     }
1187 
1188     if ((equal_ind == ind - 1) &&
1189         (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
1190                     original_ivs_start[ind]))) {
1191       equal_ind = ind;
1192     } else if ((equal_ind > ind - 1) &&
1193                !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
1194                             original_ivs_start[ind]))) {
1195       equal_ind = ind - 1;
1196     }
1197     ++ind;
1198   }
1199 
1200   __kmp_free(iterations);
1201   return true;
1202 }
1203 
1204 //----------Calculate upper bounds for the last chunk-------------------------
1205 
1206 // Calculate one upper bound for the end.
1207 template <typename T>
1208 void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
1209                             /*in/out*/ kmp_point_t original_ivs,
1210                             kmp_index_t ind) {
1211 
1212   T temp = bounds->ub0 +
1213            bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
1214 
1215   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
1216 }
1217 
1218 void kmp_calc_one_iv_end(const bounds_info_t *bounds,
1219                          /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {
1220 
1221   switch (bounds->loop_type) {
1222   default:
1223     KMP_ASSERT(false);
1224     break;
1225   case loop_type_t::loop_type_int32:
1226     kmp_calc_one_iv_end_XX<kmp_int32>(
1227         (bounds_infoXX_template<kmp_int32> *)(bounds),
1228         /*in/out*/ original_ivs, ind);
1229     break;
1230   case loop_type_t::loop_type_uint32:
1231     kmp_calc_one_iv_end_XX<kmp_uint32>(
1232         (bounds_infoXX_template<kmp_uint32> *)(bounds),
1233         /*in/out*/ original_ivs, ind);
1234     break;
1235   case loop_type_t::loop_type_int64:
1236     kmp_calc_one_iv_end_XX<kmp_int64>(
1237         (bounds_infoXX_template<kmp_int64> *)(bounds),
1238         /*in/out*/ original_ivs, ind);
1239     break;
1240   case loop_type_t::loop_type_uint64:
1241     kmp_calc_one_iv_end_XX<kmp_uint64>(
1242         (bounds_infoXX_template<kmp_uint64> *)(bounds),
1243         /*in/out*/ original_ivs, ind);
1244     break;
1245   }
1246 }
1247 
1248 // Calculate upper bounds for the last loop iteration. Just use original upper
1249 // bounds (adjusted when canonicalized to use <= / >=). No need to check that
1250 // this point is in the original space (it's likely not)
1251 void kmp_calc_original_ivs_for_end(
1252     const bounds_info_t *const original_bounds_nest, kmp_index_t n,
1253     /*out*/ kmp_point_t original_ivs) {
1254   for (kmp_index_t ind = 0; ind < n; ++ind) {
1255     auto bounds = &(original_bounds_nest[ind]);
1256     kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
1257   }
1258 }
1259 
1260 //----------Init API for non-rectangular loops--------------------------------
1261 
1262 // Init API for collapsed loops (static, no chunks defined).
1263 // "bounds_nest" has to be allocated per thread.
1264 // API will modify original bounds_nest array to bring it to a canonical form
1265 // (only <= and >=, no !=, <, >). If the original loop nest was already in a
1266 // canonical form there will be no changes to bounds in bounds_nest array
1267 // (only trip counts will be calculated). Internally API will expand the space
1268 // to parallelogram/parallelepiped, calculate total, calculate bounds for the
1269 // chunks in terms of the new IV, re-calc them in terms of old IVs (especially
1270 // important on the left side, to hit the lower bounds and not step over), and
1271 // pick the correct chunk for this thread (so it will calculate chunks up to the
1272 // needed one). It could be optimized to calculate just this chunk, potentially
1273 // a bit less well distributed among threads. It is designed to make sure that
1274 // threads will receive predictable chunks, deterministically (so that next nest
1275 // of loops with similar characteristics will get exactly same chunks on same
1276 // threads).
1277 // Current contract: chunk_bounds_nest has only lb0 and ub0,
1278 // lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
1279 extern "C" kmp_int32
1280 __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
1281                           /*in/out*/ bounds_info_t *original_bounds_nest,
1282                           /*out*/ bounds_info_t *chunk_bounds_nest,
1283                           kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
1284 
1285   KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
1286   KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
1287 
1288   if (__kmp_env_consistency_check) {
1289     __kmp_push_workshare(gtid, ct_pdo, loc);
1290   }
1291 
1292   kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
1293 
1294   bounds_info_internal_t *updated_bounds_nest =
1295       (bounds_info_internal_t *)__kmp_allocate(sizeof(bounds_info_internal_t) *
1296                                                n);
1297 
1298   for (kmp_index_t i = 0; i < n; ++i) {
1299     updated_bounds_nest[i].b = original_bounds_nest[i];
1300   }
1301 
1302   kmp_loop_nest_iv_t total =
1303       kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);
1304 
1305   if (plastiter != NULL) {
1306     *plastiter = FALSE;
1307   }
1308 
1309   if (total == 0) {
1310     // Loop won't execute:
1311     __kmp_free(updated_bounds_nest);
1312     return FALSE;
1313   }
1314 
1315   // OMPTODO: DISTRIBUTE is not supported yet
1316   __kmp_assert_valid_gtid(gtid);
1317   kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
1318 
1319   kmp_info_t *th = __kmp_threads[gtid];
1320   kmp_team_t *team = th->th.th_team;
1321   kmp_uint32 nth = team->t.t_nproc; // Number of threads
1322 
1323   KMP_DEBUG_ASSERT(tid < nth);
1324 
1325   kmp_point_t original_ivs_start =
1326       (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
1327   kmp_point_t original_ivs_end =
1328       (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
1329   kmp_point_t original_ivs_next_start =
1330       (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
1331 
1332   if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
1333                                        /*out*/ original_ivs_start)) {
1334     // Loop won't execute:
1335     __kmp_free(updated_bounds_nest);
1336     __kmp_free(original_ivs_start);
1337     __kmp_free(original_ivs_end);
1338     __kmp_free(original_ivs_next_start);
1339     return FALSE;
1340   }
1341 
1342   // Not doing this optimization for one thread:
1343   // (1) more to test
1344   // (2) without it current contract that chunk_bounds_nest has only lb0 and
1345   // ub0, lb1 and ub1 are set to 0 and can be ignored.
1346   // if (nth == 1) {
1347   //  // One thread:
1348   //  // Copy all info from original_bounds_nest, it'll be good enough.
1349 
1350   //  for (kmp_index_t i = 0; i < n; ++i) {
1351   //    chunk_bounds_nest[i] = original_bounds_nest[i];
1352   //  }
1353 
1354   //  if (plastiter != NULL) {
1355   //    *plastiter = TRUE;
1356   //  }
1357   //  __kmp_free(updated_bounds_nest);
1358   //  __kmp_free(original_ivs_start);
1359   //  __kmp_free(original_ivs_end);
1360   //  __kmp_free(original_ivs_next_start);
1361   //  return TRUE;
1362   //}
1363 
1364   kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
1365       updated_bounds_nest, original_ivs_start, n);
1366 
1367   bool last_iter = false;
1368 
1369   for (; nth > 0;) {
1370     // We could calculate chunk size once, but this is to compensate that the
1371     // original space is not parallelepiped and some threads can be left
1372     // without work:
1373     KMP_DEBUG_ASSERT(total >= new_iv);
1374 
1375     kmp_loop_nest_iv_t total_left = total - new_iv;
1376     kmp_loop_nest_iv_t chunk_size = total_left / nth;
1377     kmp_loop_nest_iv_t remainder = total_left % nth;
1378 
1379     kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
1380 
1381     if (remainder > 0) {
1382       ++curr_chunk_size;
1383       --remainder;
1384     }
1385 
1386 #if defined(KMP_DEBUG)
1387     kmp_loop_nest_iv_t new_iv_for_start = new_iv;
1388 #endif
1389 
1390     if (curr_chunk_size > 1) {
1391       new_iv += curr_chunk_size - 1;
1392     }
1393 
1394     if ((nth == 1) || (new_iv >= total - 1)) {
1395       // Do this one till the end - just in case we miscalculated
1396       // and either too much is left to process or new_iv is a bit too big:
1397       kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1398                                     /*out*/ original_ivs_end);
1399 
1400       last_iter = true;
1401     } else {
1402       // Note: here we make sure it's past (or equal to) the previous point.
1403       if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
1404                                                updated_bounds_nest,
1405                                                original_ivs_start, new_iv,
1406                                                /*out*/ original_ivs_end)) {
1407         // We could not find the ending point, use the original upper bounds:
1408         kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1409                                       /*out*/ original_ivs_end);
1410 
1411         last_iter = true;
1412       }
1413     }
1414 
1415 #if defined(KMP_DEBUG)
1416     auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
1417         updated_bounds_nest, original_ivs_end, n);
1418     KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
1419 #endif
1420 
1421     if (last_iter && (tid != 0)) {
1422       // We are done, this was last chunk, but no chunk for current thread was
1423       // found:
1424       __kmp_free(updated_bounds_nest);
1425       __kmp_free(original_ivs_start);
1426       __kmp_free(original_ivs_end);
1427       __kmp_free(original_ivs_next_start);
1428       return FALSE;
1429     }
1430 
1431     if (tid == 0) {
1432       // We found the chunk for this thread, now we need to check if it's the
1433       // last chunk or not:
1434 
1435       if (last_iter ||
1436           !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
1437                                       /*out*/ original_ivs_next_start)) {
1438         // no more loop iterations left to process,
1439         // this means that currently found chunk is the last chunk:
1440         if (plastiter != NULL) {
1441           *plastiter = TRUE;
1442         }
1443       }
1444 
1445       // Fill in chunk bounds:
1446       for (kmp_index_t i = 0; i < n; ++i) {
1447         chunk_bounds_nest[i] =
1448             original_bounds_nest[i]; // To fill in types, etc. - optional
1449         chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
1450         chunk_bounds_nest[i].lb1_u64 = 0;
1451 
1452         chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
1453         chunk_bounds_nest[i].ub1_u64 = 0;
1454       }
1455 
1456       __kmp_free(updated_bounds_nest);
1457       __kmp_free(original_ivs_start);
1458       __kmp_free(original_ivs_end);
1459       __kmp_free(original_ivs_next_start);
1460       return TRUE;
1461     }
1462 
1463     --tid;
1464     --nth;
1465 
1466     bool next_chunk = kmp_calc_next_original_ivs(
1467         original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
1468     if (!next_chunk) {
1469       // no more loop iterations to process,
1470       // the prevoius chunk was the last chunk
1471       break;
1472     }
1473 
1474     // original_ivs_start is next to previous chunk original_ivs_end,
1475     // we need to start new chunk here, so chunks will be one after another
1476     // without any gap or overlap:
1477     new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
1478                                                original_ivs_start, n);
1479   }
1480 
1481   __kmp_free(updated_bounds_nest);
1482   __kmp_free(original_ivs_start);
1483   __kmp_free(original_ivs_end);
1484   __kmp_free(original_ivs_next_start);
1485   return FALSE;
1486 }
1487