/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// avoid inadvertently using a library-based abs
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};
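
// Usage note (illustrative): the rest of this file allocates small per-thread
// scratch arrays through this helper, e.g.
//   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); // indexable array
// When n * sizeof(T) fits into the 32-byte stackAlloc buffer (up to 4 levels
// of 64-bit values) no heap allocation happens; larger nests fall back to
// __kmp_allocate in the constructor and __kmp_free in the destructor.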

//----------Loop canonicalization---------------------------------------------

// For loop nest (any shape):
// convert != to < or >;
// switch from using < or > to <= or >=.
// "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. It should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed).
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}
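
// Example: the loop "for (i = 0; i < 10; i += 2)" arrives here as lb0 = 0,
// ub0 = 10, step = 2 with comparison '<'; canonicalization turns it into
// lb0 = 0, ub0 = 9, step = 2 with comparison '<=' without changing the
// iteration set {0, 2, 4, 6, 8}.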

// Canonicalize loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use span for this, it's too big.

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}
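
// Example: for the canonicalized bounds lb0 = 0, ub0 = 9, step = 2, '<='
// (the "i < 10; i += 2" loop mentioned above), trip_count = (9 - 0) / 2 + 1
// = 5.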

// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim original iv according to its type.
// Return a kmp_uint64 value which can be easily used in all internal
// calculations and can be statically cast back to the original type in user
// code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
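
// Example: kmp_fix_iv(loop_type_t::loop_type_int8, 0xFF) interprets 0xFF as
// the 8-bit value -1 and sign-extends it, returning 0xFFFFFFFFFFFFFFFF, while
// for loop_type_t::loop_type_uint8 the same input is zero-extended to 0xFF.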

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into upper bounds on this level,
// false otherwise
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into lower-upper bounds on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, should start
    // with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind
// (rectangular case: no bounds check is performed).
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate overall trip count.
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated).
// Returns trip count of overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}

//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to overall new_iv.
// Note: original IV will be returned as if it had kmp_uint64 type,
// will have to be converted to original type in user code.
// Note: trip counts should be already calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do different
// interface without array or possibly template this over n
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);

  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
}
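
// Example: for a rectangular nest of two loops with trip counts {4, 3} and
// new_iv = 7, the first loop above computes iterations = {7 / 3, 7 % 3} =
// {2, 1}, and the second loop maps those iteration numbers back to the
// original IV values on each level.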

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate span for IV on this loop level for "<=" case.
// Note: it's for <= on this loop nest level, so lower bound should be smallest
// value, upper bound should be the biggest value. If the loop won't execute,
// 'smallest' may be bigger than 'biggest', but we'd better not switch them
// around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate span for IV on this loop level for ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate maximum possible span for IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped
// shape. We need to stay in the original range of values, so that there will
// be no overflow; for that we'll adjust both upper and lower bounds as needed.
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // get upper and lower bounds to be parallel
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
    // The idea here is that for this IV we are now getting the same span
    // irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // still need to move lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good on outer_iv biggest number,
        // can use it to find where to move upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // still need to move upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}
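
// Example: for the triangular inner loop "j <= 0 + 1 * i" (lb1 = 0, ub1 = 1)
// the signs of lb1 and ub1 differ, so the loop is turned into a rectangle
// (lb1 = ub1 = 0); since old_ub1 > ub1 in the '<=' case, ub0 is then increased
// by (old_ub1 - ub1) * previous->span_biggest so that the expanded rectangle
// still covers the whole original triangle.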

// Do all processing for one canonicalized loop in the nest
// (assuming that outer loops already were processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process loop nest to have a parallelepiped shape,
// calculate biggest spans for IV's on all levels and calculate overall trip
// count. "bounds_nest" has to be allocated per thread.
// Returns overall trip count (for adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate number of iterations in original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
// (not counting initial iteration)
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations =
        (bounds->lb0 +
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
         static_cast<T>(original_ivs[ind])) /
        __kmp_abs(bounds->step);
  }

  return iterations;
}
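
// Example: for a '<=' loop with lb0 = 0, lb1 = 0 and step = 2, the point
// original_ivs[ind] = 6 corresponds to iteration number (6 - 0) / 2 = 3
// (iterations are counted from 0, so this is the fourth executed iteration).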

// Calculate number of iterations in the original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take updated bounds and calculate what new_iv will correspond to this point.
// When we are getting original IVs from new_iv, we have to adjust to fit into
// original loops bounds. Getting new_iv for the adjusted original IVs will help
// with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}
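
// Example: with trip counts {4, 3} and per-level iteration numbers {2, 1} the
// loop above computes new_iv = 2 * 3 + 1 = 7, i.e. a mixed-radix composition
// that is the inverse of the div/mod decomposition used when new_iv is split
// back into per-level iterations.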

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // Space is empty:
        return false;
      } else {
        // Go to next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: simply taking the lower bounds of all loops may not work -
// on some iterations of the outer loops the inner loops may be empty.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From current set of original IVs calculate next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the inner-most level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
// Return true if it fits into upper bound on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  // big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // we moved to the next iteration on one of outer loops, may as well use
    // the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in expanded space, but:
    // - we need to hit original space lower bound, so need to account for
    //   that
    // - we have to go into original space, even if that means adding more
    //   iterations than was planned
    // - we have to go past (or equal to) previous point (which is the chunk
    //   starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use
      // heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels start of the chunk is same as the end, need to
      // be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // End of the chunk can't be smaller (for >= bigger) than its start.
        // Use heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate old induction variable corresponding to overall
// new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate old induction variables corresponding to overall new_iv for the
// chunk end. If due to space extension we are getting old IVs outside of the
// boundaries, bring them into the boundaries. Need to do this in the runtime,
// esp. on the lower bounds side. When getting the result we need to make sure
// that the new chunk starts at the next position after the old chunk, not
// overlapping with it (this is done elsewhere), and we need to make sure the
// end of the chunk is further than the beginning of the chunk. We don't need
// an exact ending point here, just something more-or-less close to the
// desired chunk length; bigger is fine (smaller would be fine too, but we
// risk going into an infinite loop, so do smaller only at the very end of the
// space). Result: false if we could not find the ending point in the original
// loop space. In this case the caller can use the original upper bounds as
// the end of the chunk. The chunk won't be empty, because it will have at
// least the starting point, which is by construction in the original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // can't be same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use original upper
// bounds (adjusted when canonicalized to use <= / >=). No need to check that
// this point is in the original space (it's likely not)
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

/**************************************************************************
 * Identify nested loop structure - loops come in the canonical form
 * Lower triangle matrix: i = 0; i <= N; i++        {0,0}:{N,0}
 *                        j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
 * Upper Triangle matrix
 *                        i = 0; i <= N; i++        {0,0}:{N,0}
 *                        j = 0+1*i; j <= N; j++    {0,1}:{N,0}
 * ************************************************************************/
nested_loop_type_t
kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
                                   /*in*/ kmp_index_t n) {
  // only 2-level nested loops are supported
  if (n != 2) {
    return nested_loop_type_unkown;
  }
  // loops must be canonical
  KMP_ASSERT(
      (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
      (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
  // check outer loop bounds: for triangular need to be {0,0}:{N,0}
  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb0_u64);
  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].lb1_u64);
  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                        original_bounds_nest[0].ub1_u64);
  if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
    return nested_loop_type_unkown;
  }
  // check inner bounds to determine triangle type
  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb0_u64);
  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub0_u64);
  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].lb1_u64);
  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                        original_bounds_nest[1].ub1_u64);
  // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
      (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
    return nested_loop_type_lower_triangular_matrix;
  }
  // upper triangle loop inner bounds need to be {0,1}:{N,0}
  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
      inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
    return nested_loop_type_upper_triangular_matrix;
  }
  return nested_loop_type_unkown;
}
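
// For illustration, in C form the two recognized shapes are:
//   lower triangular: for (i = 0; i <= N; i++) for (j = 0; j <= i; j++)
//                     (or "j < i" for the 0/-1 inner upper bound case)
//   upper triangular: for (i = 0; i <= N; i++) for (j = i; j <= N; j++)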

/**************************************************************************
 * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
 * Start point is x so the result is always > sqrt(x)
 * The method has uniform convergence, PRECISION is set to 0.1
 * ************************************************************************/
#define level_of_precision 0.1
double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
  double sqrt_old = 0.;
  double sqrt_new = (double)x;
  do {
    sqrt_old = sqrt_new;
    sqrt_new = (sqrt_old + x / sqrt_old) / 2;
  } while ((sqrt_old - sqrt_new) > level_of_precision);
  return sqrt_new;
}
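
// Example: sqrt_newton_approx(100) iterates 100 -> 50.5 -> ~26.2 -> ~15.0 ->
// ~10.8 -> ~10.03 -> ~10.00005 and stops once two successive values differ by
// no more than level_of_precision, so the returned value is slightly above
// the exact root, as noted above.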
1343
1344 /**************************************************************************
1345 * Handle lower triangle matrix in the canonical form
1346 * i = 0; i <= N; i++ {0,0}:{N,0}
1347 * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
1348 * ************************************************************************/
kmp_handle_lower_triangle_matrix(kmp_uint32 nth,kmp_uint32 tid,kmp_index_t n,bounds_info_t * original_bounds_nest,bounds_info_t * chunk_bounds_nest)1349 void kmp_handle_lower_triangle_matrix(
1350 /*in*/ kmp_uint32 nth,
1351 /*in*/ kmp_uint32 tid,
1352 /*in */ kmp_index_t n,
1353 /*in/out*/ bounds_info_t *original_bounds_nest,
1354 /*out*/ bounds_info_t *chunk_bounds_nest) {
1355
1356 // transfer loop types from the original loop to the chunks
1357 for (kmp_index_t i = 0; i < n; ++i) {
1358 chunk_bounds_nest[i] = original_bounds_nest[i];
1359 }
1360 // cleanup iv variables
1361 kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1362 original_bounds_nest[0].ub0_u64);
1363 kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1364 original_bounds_nest[0].lb0_u64);
1365 kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1366 original_bounds_nest[1].ub0_u64);
1367 // calculate the chunk's lower and upper bounds
1368 // the total number of iterations in the loop is the sum of the arithmetic
1369 // progression from the outer lower to outer upper bound (inclusive since the
1370 // loop is canonical) note that less_than inner loops (inner_ub0 = -1)
1371 // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
1372 // + 1) -> N - 1
1373 kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
1374 kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
1375 // the current thread's number of iterations:
1376 // each thread gets an equal number of iterations: total number of iterations
1377 // divided by the number of threads plus, if there's a remainder,
1378 // the first threads with the number up to the remainder get an additional
1379 // iteration each to cover it
1380 kmp_uint64 iter_current =
1381 iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
1382 // cumulative number of iterations executed by all the previous threads:
1383 // threads with the tid below the remainder will have (iter_total/nth+1)
1384 // elements, and so will all threads before them so the cumulative number of
1385 // iterations executed by the all previous will be the current thread's number
1386 // of iterations multiplied by the number of previous threads which is equal
1387 // to the current thread's tid; threads with the number equal or above the
1388 // remainder will have (iter_total/nth) elements so the cumulative number of
1389 // iterations previously executed is its number of iterations multipled by the
1390 // number of previous threads which is again equal to the current thread's tid
1391 // PLUS all the remainder iterations that will have been executed by the
1392 // previous threads
1393 kmp_uint64 iter_before_current =
1394 tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
  // cumulative number of iterations executed including the current thread:
  // the cumulative number executed before it plus its own
  kmp_uint64 iter_with_current = iter_before_current + iter_current;
  // calculate the outer loop lower bound (lbo), which is the max outer IV
  // value whose cumulative iteration count is equal to or just below the
  // total number of iterations executed by the previous threads; for
  // less_than (1-based) inner loops (inner_ub0 == -1) this is
  //   lbo*(lbo-1)/2 <= iter_before_current
  //     => lbo^2 - lbo - 2*iter_before_current <= 0
  // and for less_than_equal (0-based) inner loops (inner_ub0 == 0) it is
  //   lbo*(lbo+1)/2 <= iter_before_current
  //     => lbo^2 + lbo - 2*iter_before_current <= 0
  // both cases are handled similarly, using a parameter to control the sign
  // of the linear term
  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_before_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop lower bound, which is the remaining number of
  // iterations required to hit the total number of iterations executed by the
  // previous threads, giving the starting point of this thread
  kmp_uint64 lower_bound_inner =
      iter_before_current -
      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
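  // Illustrative example (continued): for tid = 2, inner_adjustment = 1 and
  // lower_bound_outer = (sqrt(1 + 8 * 6) + 1) / 2 - 1 = (7 + 1) / 2 - 1 = 3,
  // lower_bound_inner = 6 - (3 + 1) * 3 / 2 = 0, so this thread starts at
  // the iteration (i = 3, j = 0).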
  // calculate the outer loop upper bound using the same approach as for the
  // lower bound, except using the total number of iterations executed
  // including the current thread
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_with_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop upper bound, which is the remaining number of
  // iterations required to hit the total number of iterations executed
  // including the current thread, giving the starting point of the next
  // thread
  kmp_uint64 upper_bound_inner =
      iter_with_current -
      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
  // adjust the upper bounds down by 1 element so they point at the last
  // iteration of the current thread rather than the first iteration of the
  // next thread
  if (upper_bound_inner == 0) {
    // {n,0} => {n-1,n-1}
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // {n,m} => {n,m-1} (m!=0)
    upper_bound_inner -= 1;
  }
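  // Illustrative example (continued), assuming sqrt_newton_approx returns the
  // integer (floor) square root: for tid = 2, iter_with_current = 8,
  // upper_bound_outer = (sqrt(1 + 8 * 8) + 1) / 2 - 1 = (8 + 1) / 2 - 1 = 3
  // and upper_bound_inner = 8 - (3 + 1) * 3 / 2 = 2, adjusted to {3,1}; the
  // chunk is therefore (3,0)..(3,1). For tid = 1 (iter_with_current = 6) the
  // raw result is {3,0}, which the branch above adjusts to {2,2}.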

  // assign the values, zeroing out lb1 and ub1 values since the iteration
  // space is now one-dimensional
  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
#endif
}

/**************************************************************************
 * Handle upper triangle matrix in the canonical form
 *   i = 0; i <= N; i++        {0,0}:{N,0}
 *   j = 0+1*i; j <= N; j++    {0,1}:{N,0}
 * ************************************************************************/
void kmp_handle_upper_triangle_matrix(
    /*in*/ kmp_uint32 nth,
    /*in*/ kmp_uint32 tid,
    /*in*/ kmp_index_t n,
    /*in/out*/ bounds_info_t *original_bounds_nest,
    /*out*/ bounds_info_t *chunk_bounds_nest) {

  // transfer loop types from the original loop to the chunks
  for (kmp_index_t i = 0; i < n; ++i) {
    chunk_bounds_nest[i] = original_bounds_nest[i];
  }
  // clean up (normalize) the IV bound values for the loop IV types
  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].lb0_u64);
  [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
      original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
  // calculate the chunk's lower and upper bounds:
  // the total number of iterations in the loop is the sum of the arithmetic
  // progression from the outer lower to the outer upper bound (inclusive,
  // since the loop is canonical); note that less_than inner loops
  // (inner_ub0 == -1) effectively make the progression 1-based, turning
  // N = (outer_ub0 - outer_lb0 + 1) into N - 1
  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
  // the current thread's number of iterations:
  // each thread gets an equal share (iter_total / nth); if there is a
  // remainder, the first (iter_total % nth) threads get one additional
  // iteration each to cover it
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
  // cumulative number of iterations executed by all the previous threads:
  // a thread with tid below the remainder gets (iter_total / nth + 1)
  // iterations, and so do all threads before it, so the cumulative count is
  // this thread's iteration count multiplied by the number of previous
  // threads, which equals its tid; a thread with tid at or above the
  // remainder gets (iter_total / nth) iterations, so the cumulative count is
  // its iteration count multiplied by its tid PLUS all the remainder
  // iterations already covered by the previous threads
  kmp_uint64 iter_before_current =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
  // cumulative number of iterations executed including the current thread:
  // the cumulative number executed before it plus its own
  kmp_uint64 iter_with_current = iter_before_current + iter_current;
  // calculate the outer loop lower bound (lbo), which is the max outer IV
  // value whose cumulative iteration count is equal to or just below the
  // total number of iterations executed by the previous threads:
  //   lbo*(lbo+1)/2 <= iter_before_current
  //     => lbo^2 + lbo - 2*iter_before_current <= 0
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
  // calculate the inner loop lower bound, which is the remaining number of
  // iterations required to hit the total number of iterations executed by the
  // previous threads, giving the starting point of this thread
  kmp_uint64 lower_bound_inner =
      iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
  // calculate the outer loop upper bound using the same approach as for the
  // lower bound, except using the total number of iterations executed
  // including the current thread
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
  // calculate the inner loop upper bound, which is the remaining number of
  // iterations required to hit the total number of iterations executed
  // including the current thread, giving the starting point of the next
  // thread
  kmp_uint64 upper_bound_inner =
      iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
  // adjust the upper bounds down by 1 element so they point at the last
  // iteration of the current thread rather than the first iteration of the
  // next thread
  if (upper_bound_inner == 0) {
    // {n,0} => {n-1,n-1}
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // {n,m} => {n,m-1} (m!=0)
    upper_bound_inner -= 1;
  }

  // assign the values, zeroing out lb1 and ub1 values since the iteration
  // space is now one-dimensional; the bounds computed above describe the
  // mirrored (lower triangle) enumeration, so reflect them around
  // (outer_iters - 1), which also swaps the roles of the lower and upper
  // bounds
  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;
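  // Illustrative example (not part of the algorithm), assuming
  // sqrt_newton_approx returns the integer (floor) square root: for the nest
  //   i = 0; i <= 3; i++
  //   j = i; j <= 3; j++
  // with nth = 4 and tid = 2, iter_total = 10, iter_before_current = 6 and
  // iter_with_current = 8 give lower/upper bounds {3,0} and {3,1} in the
  // mirrored space; reflecting around outer_iters - 1 = 3 yields the chunk
  // lb = {0,2}, ub = {0,3}, i.e. the iterations (0,2) and (0,3).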

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
#endif
}
//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=, no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated). Internally the API
// will expand the space to a parallelogram/parallelepiped, calculate the
// total, calculate the bounds for the chunks in terms of the new IV, re-calc
// them in terms of the old IVs (especially important on the left side, to hit
// the lower bounds and not step over), and pick the correct chunk for this
// thread (so it will calculate chunks up to the needed one). It could be
// optimized to calculate just this chunk, at the cost of a potentially
// slightly less even distribution among threads. It is designed to make sure
// that threads will receive predictable chunks, deterministically (so that the
// next nest of loops with similar characteristics will get exactly the same
// chunks on the same threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0;
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
// future.)
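// A minimal, illustrative calling sketch (hypothetical, not actual compiler
// output; the real call sequence and the bounds_info_t setup are emitted by
// the compiler for a collapse(2) worksharing loop):
//   bounds_info_t nest[2];   // describes the collapsed 2-level loop nest
//   bounds_info_t chunk[2];  // receives this thread's chunk
//   kmp_int32 last = 0;
//   if (__kmpc_for_collapsed_init(&loc, gtid, nest, chunk, 2, &last)) {
//     // chunk[i].lb0_u64 / chunk[i].ub0_u64 hold the first and last
//     // original-IV values of this thread's chunk; the generated code walks
//     // the original iteration space from that start tuple to that end
//     // tuple, and 'last' tells whether this chunk contains the final
//     // iteration.
//   }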
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // Loop won't execute:
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  // Handle special cases
  nested_loop_type_t loop_type =
      kmp_identify_nested_loop_structure(original_bounds_nest, n);
  if (loop_type == nested_loop_type_lower_triangular_matrix) {
    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
                                     chunk_bounds_nest);
    return TRUE;
  }

  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // Loop won't execute:
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test
  // (2) without it, the current contract holds: chunk_bounds_nest has only
  // lb0 and ub0; lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //   // One thread:
  //   // Copy all info from original_bounds_nest, it'll be good enough.

  //   for (kmp_index_t i = 0; i < n; ++i) {
  //     chunk_bounds_nest[i] = original_bounds_nest[i];
  //   }

  //   if (plastiter != NULL) {
  //     *plastiter = TRUE;
  //   }
  //   return TRUE;
  // }

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  for (; nth > 0;) {
    // We could calculate the chunk size once, but this per-pass calculation
    // compensates for the fact that the original space is not a
    // parallelepiped and some threads can be left without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }
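    // Illustrative example (not part of the algorithm): with total_left = 10
    // and nth = 4 remaining threads, chunk_size = 2 and remainder = 2, so the
    // chunk computed on this pass takes 3 iterations; later passes recompute
    // the split from the updated new_iv and the decremented nth.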

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done: this was the last chunk, but no chunk for the current
      // thread was found:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // no more loop iterations left to process,
        // which means that the currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

    --tid;
    --nth;
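    // One chunk is consumed per pass: the first pass computes thread 0's
    // chunk, the next one thread 1's, and so on; when the local tid reaches 0
    // the chunk computed on that pass belongs to this thread. E.g. a thread
    // entering with tid == 2 discards the first two chunks and returns the
    // third.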

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // no more loop iterations to process,
      // the previous chunk was the last chunk
      break;
    }

    // original_ivs_start now immediately follows the previous chunk's
    // original_ivs_end, so the new chunk starts here and chunks follow one
    // after another without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  return FALSE;
}
