xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_sched.cpp (revision 9f23cbd6cae82fd77edfad7173432fa8dccd0a95)
1 /*
2  * kmp_sched.cpp -- static scheduling -- iteration initialization
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Static scheduling initialization.
14 
15   NOTE: team->t.t_nproc is a constant inside of any dispatch loop, however
16         it may change values between parallel regions.  __kmp_max_nth
17         is the largest value __kmp_nth may take, 1 is the smallest. */
18 
19 #include "kmp.h"
20 #include "kmp_error.h"
21 #include "kmp_i18n.h"
22 #include "kmp_itt.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 
26 #if OMPT_SUPPORT
27 #include "ompt-specific.h"
28 #endif
29 
30 #ifdef KMP_DEBUG
31 //-------------------------------------------------------------------------
32 // Per-type conversion specifiers ( d, u, lld, llu, ld ) for debug prints.
// The trace code in this file splices traits_t<T>::spec into format strings
// built with __kmp_str_format(), so one templated routine can print values of
// whatever integer type it was instantiated with. These definitions are only
// compiled when KMP_DEBUG is defined.
33 char const *traits_t<int>::spec = "d";
34 char const *traits_t<unsigned int>::spec = "u";
35 char const *traits_t<long long>::spec = "lld";
36 char const *traits_t<unsigned long long>::spec = "llu";
37 char const *traits_t<long>::spec = "ld";
38 //-------------------------------------------------------------------------
39 #endif
40 
// KMP_STATS_LOOP_END(stat): statistics epilogue used on every return path of
// the scheduling routines in this file.
//
// The macro is not self-contained: it reads `pupper`, `plower` (pointers) and
// `incr` from the scope in which it is expanded. From those it derives the
// number of iterations between *plower and *pupper (both bounds inclusive),
// records that number under `stat` with KMP_COUNT_VALUE and then calls
// KMP_POP_PARTITIONED_TIMER().
//
// The increments 1 and -1 are handled separately so no division is needed for
// them; all arithmetic is done in kmp_int64.
// When KMP_STATS_ENABLED is false the macro expands to nothing.
41 #if KMP_STATS_ENABLED
42 #define KMP_STATS_LOOP_END(stat)                                               \
43   {                                                                            \
44     kmp_int64 t;                                                               \
45     kmp_int64 u = (kmp_int64)(*pupper);                                        \
46     kmp_int64 l = (kmp_int64)(*plower);                                        \
47     kmp_int64 i = (kmp_int64)incr;                                             \
48     if (i == 1) {                                                              \
49       t = u - l + 1;                                                           \
50     } else if (i == -1) {                                                      \
51       t = l - u + 1;                                                           \
52     } else if (i > 0) {                                                        \
53       t = (u - l) / i + 1;                                                     \
54     } else {                                                                   \
55       t = (l - u) / (-i) + 1;                                                  \
56     }                                                                          \
57     KMP_COUNT_VALUE(stat, t);                                                  \
58     KMP_POP_PARTITIONED_TIMER();                                               \
59   }
60 #else
61 #define KMP_STATS_LOOP_END(stat) /* Nothing */
62 #endif
63 
// Placeholder source location used when a caller supplies no ident_t; its
// source string is ";unknown;unknown;0;0;;".
64 static ident_t loc_stub = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"};
// Makes `loc` safe to dereference: a NULL pointer is redirected to loc_stub,
// any other value is left untouched. `loc` is taken by reference, so the
// caller's own pointer variable is the one that gets updated.
65 static inline void check_loc(ident_t *&loc) {
66   if (loc == NULL)
67     loc = &loc_stub; // may need to report location info to ittnotify
68 }
69 
// Computes the calling thread's share of a statically scheduled loop.
//
// On entry *plower / *pupper hold the (inclusive) bounds of the whole
// iteration space. On return they hold the bounds assigned to this thread,
// *pstride holds the stride written for the selected schedule and *plastiter
// tells whether this thread is the one treated as executing the last
// iteration.
//
// Parameters:
//   loc         source location descriptor; may be NULL (check_loc() swaps in
//               a stub before it is dereferenced for tracing / ITT metadata)
//   global_tid  global thread id, validated with __kmp_assert_valid_gtid()
//   schedtype   schedule kind; modifier bits are stripped first, and values
//               above kmp_ord_upper are treated as "distribute" schedules and
//               shifted onto the corresponding kmp_sch_static* value
//   plastiter   out: last-iteration flag
//   plower      in/out: lower bound
//   pupper      in/out: upper bound
//   pstride     out: stride. NOTE(review): nothing is stored through pstride
//               on the kmp_sch_static_balanced_chunked path.
//   incr        loop increment; a zero increment is only reported when
//               __kmp_env_consistency_check is set
//   chunk       chunk size, used by the chunked schedules
//   codeptr     (only when OMPT_SUPPORT && OMPT_OPTIONAL) forwarded to the
//               OMPT callbacks
//
// Three early-return paths leave *plower / *pupper untouched: zero-trip
// loops, serialized teams and teams consisting of a single thread.
70 template <typename T>
71 static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
72                                   kmp_int32 schedtype, kmp_int32 *plastiter,
73                                   T *plower, T *pupper,
74                                   typename traits_t<T>::signed_t *pstride,
75                                   typename traits_t<T>::signed_t incr,
76                                   typename traits_t<T>::signed_t chunk
77 #if OMPT_SUPPORT && OMPT_OPTIONAL
78                                   ,
79                                   void *codeptr
80 #endif
81 ) {
82   KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
83   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static);
84   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling);
85 
86   // Clear monotonic/nonmonotonic bits (ignore it)
87   schedtype = SCHEDULE_WITHOUT_MODIFIERS(schedtype);
88 
89   typedef typename traits_t<T>::unsigned_t UT;
90   typedef typename traits_t<T>::signed_t ST;
91   /*  this all has to be changed back to TID and such.. */
92   kmp_int32 gtid = global_tid;
93   kmp_uint32 tid;
94   kmp_uint32 nth;
95   UT trip_count;
96   kmp_team_t *team;
97   __kmp_assert_valid_gtid(gtid);
98   kmp_info_t *th = __kmp_threads[gtid];
99 
100 #if OMPT_SUPPORT && OMPT_OPTIONAL
101   ompt_team_info_t *team_info = NULL;
102   ompt_task_info_t *task_info = NULL;
103   ompt_work_t ompt_work_type = ompt_work_loop;
104 
      // Shared by all callers of this instantiation; flipped from 0 to 1 with
      // a compare-and-store so the warning below is printed only once.
105   static kmp_int8 warn = 0;
106 
107   if (ompt_enabled.ompt_callback_work || ompt_enabled.ompt_callback_dispatch) {
108     // Only fully initialize variables needed by OMPT if OMPT is enabled.
109     team_info = __ompt_get_teaminfo(0, NULL);
110     task_info = __ompt_get_task_info_object(0);
111     // Determine workshare type
112     if (loc != NULL) {
113       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
114         ompt_work_type = ompt_work_loop;
115       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
116         ompt_work_type = ompt_work_sections;
117       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
118         ompt_work_type = ompt_work_distribute;
119       } else {
            // None of the KMP_IDENT_WORK_* flags is set: keep the default
            // ompt_work_loop and warn.
120         kmp_int8 bool_res =
121             KMP_COMPARE_AND_STORE_ACQ8(&warn, (kmp_int8)0, (kmp_int8)1);
122         if (bool_res)
123           KMP_WARNING(OmptOutdatedWorkshare);
124       }
125       KMP_DEBUG_ASSERT(ompt_work_type);
126     }
127   }
128 #endif
129 
130   KMP_DEBUG_ASSERT(plastiter && plower && pupper && pstride);
131   KE_TRACE(10, ("__kmpc_for_static_init called (%d)\n", global_tid));
132 #ifdef KMP_DEBUG
133   {
134     char *buff;
135     // create format specifiers before the debug output
136     buff = __kmp_str_format(
137         "__kmpc_for_static_init: T#%%d sched=%%d liter=%%d iter=(%%%s,"
138         " %%%s, %%%s) incr=%%%s chunk=%%%s signed?<%s>\n",
139         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec,
140         traits_t<ST>::spec, traits_t<ST>::spec, traits_t<T>::spec);
141     KD_TRACE(100, (buff, global_tid, schedtype, *plastiter, *plower, *pupper,
142                    *pstride, incr, chunk));
143     __kmp_str_free(&buff);
144   }
145 #endif
146 
147   if (__kmp_env_consistency_check) {
148     __kmp_push_workshare(global_tid, ct_pdo, loc);
149     if (incr == 0) {
150       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
151                             loc);
152     }
153   }
154   /* special handling for zero-trip loops */
      // The bounds are "crossed" with respect to the sign of incr, so there is
      // nothing to hand out.
155   if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
156     if (plastiter != NULL)
157       *plastiter = FALSE;
158     /* leave pupper and plower set to entire iteration space */
159     *pstride = incr; /* value should never be used */
160 // *plower = *pupper - incr;
161 // let compiler bypass the illegal loop (like for(i=1;i<10;i--))
162 // THE LINE COMMENTED ABOVE CAUSED shape2F/h_tests_1.f TO HAVE A FAILURE
163 // ON A ZERO-TRIP LOOP (lower=1, upper=0,stride=1) - JPH June 23, 2009.
164 #ifdef KMP_DEBUG
165     {
166       char *buff;
167       // create format specifiers before the debug output
168       buff = __kmp_str_format("__kmpc_for_static_init:(ZERO TRIP) liter=%%d "
169                               "lower=%%%s upper=%%%s stride = %%%s "
170                               "signed?<%s>, loc = %%s\n",
171                               traits_t<T>::spec, traits_t<T>::spec,
172                               traits_t<ST>::spec, traits_t<T>::spec);
173       check_loc(loc);
174       KD_TRACE(100,
175                (buff, *plastiter, *plower, *pupper, *pstride, loc->psource));
176       __kmp_str_free(&buff);
177     }
178 #endif
179     KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
180 
181 #if OMPT_SUPPORT && OMPT_OPTIONAL
182     if (ompt_enabled.ompt_callback_work) {
          // The iteration count reported to the tool is 0 on this path.
183       ompt_callbacks.ompt_callback(ompt_callback_work)(
184           ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
185           &(task_info->task_data), 0, codeptr);
186     }
187 #endif
188     KMP_STATS_LOOP_END(OMP_loop_static_iterations);
189     return;
190   }
191 
192   // Although there are schedule enumerations above kmp_ord_upper which are not
193   // schedules for "distribute", the only ones which are useful are dynamic, so
194   // cannot be seen here, since this codepath is only executed for static
195   // schedules.
196   if (schedtype > kmp_ord_upper) {
197     // we are in DISTRIBUTE construct
198     schedtype += kmp_sch_static -
199                  kmp_distribute_static; // AC: convert to usual schedule type
        // With t_serialized > 1 the thread's own team is used with tid 0;
        // otherwise the work is split over the parent team, indexed by this
        // team's t_master_tid.
200     if (th->th.th_team->t.t_serialized > 1) {
201       tid = 0;
202       team = th->th.th_team;
203     } else {
204       tid = th->th.th_team->t.t_master_tid;
205       team = th->th.th_team->t.t_parent;
206     }
207   } else {
208     tid = __kmp_tid_from_gtid(global_tid);
209     team = th->th.th_team;
210   }
211 
212   /* determine if "for" loop is an active worksharing construct */
213   if (team->t.t_serialized) {
214     /* serialized parallel, each thread executes whole iteration space */
215     if (plastiter != NULL)
216       *plastiter = TRUE;
217     /* leave pupper and plower set to entire iteration space */
        // The stride is set to the signed length of the range (negative when
        // incr is negative).
218     *pstride =
219         (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
220 
221 #ifdef KMP_DEBUG
222     {
223       char *buff;
224       // create format specifiers before the debug output
225       buff = __kmp_str_format("__kmpc_for_static_init: (serial) liter=%%d "
226                               "lower=%%%s upper=%%%s stride = %%%s\n",
227                               traits_t<T>::spec, traits_t<T>::spec,
228                               traits_t<ST>::spec);
229       KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride));
230       __kmp_str_free(&buff);
231     }
232 #endif
233     KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
234 
235 #if OMPT_SUPPORT && OMPT_OPTIONAL
236     if (ompt_enabled.ompt_callback_work) {
237       ompt_callbacks.ompt_callback(ompt_callback_work)(
238           ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
239           &(task_info->task_data), *pstride, codeptr);
240     }
241 #endif
242     KMP_STATS_LOOP_END(OMP_loop_static_iterations);
243     return;
244   }
245   nth = team->t.t_nproc;
      // A team of one thread is handled exactly like the serialized case above.
246   if (nth == 1) {
247     if (plastiter != NULL)
248       *plastiter = TRUE;
249     *pstride =
250         (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
251 #ifdef KMP_DEBUG
252     {
253       char *buff;
254       // create format specifiers before the debug output
255       buff = __kmp_str_format("__kmpc_for_static_init: (serial) liter=%%d "
256                               "lower=%%%s upper=%%%s stride = %%%s\n",
257                               traits_t<T>::spec, traits_t<T>::spec,
258                               traits_t<ST>::spec);
259       KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride));
260       __kmp_str_free(&buff);
261     }
262 #endif
263     KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
264 
265 #if OMPT_SUPPORT && OMPT_OPTIONAL
266     if (ompt_enabled.ompt_callback_work) {
267       ompt_callbacks.ompt_callback(ompt_callback_work)(
268           ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
269           &(task_info->task_data), *pstride, codeptr);
270     }
271 #endif
272     KMP_STATS_LOOP_END(OMP_loop_static_iterations);
273     return;
274   }
275 
276   /* compute trip count */
      // The result is stored in the unsigned type UT; +/-1 increments avoid
      // the division.
277   if (incr == 1) {
278     trip_count = *pupper - *plower + 1;
279   } else if (incr == -1) {
280     trip_count = *plower - *pupper + 1;
281   } else if (incr > 0) {
282     // upper-lower can exceed the limit of signed type
283     trip_count = (UT)(*pupper - *plower) / incr + 1;
284   } else {
285     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
286   }
287 
288 #if KMP_STATS_ENABLED
289   if (KMP_MASTER_GTID(gtid)) {
290     KMP_COUNT_VALUE(OMP_loop_static_total_iterations, trip_count);
291   }
292 #endif
293 
294   if (__kmp_env_consistency_check) {
295     /* tripcount overflow? */
        // A count of 0 although the bounds differ can only come from the
        // "+ 1" above wrapping around.
296     if (trip_count == 0 && *pupper != *plower) {
297       __kmp_error_construct(kmp_i18n_msg_CnsIterationRangeTooLarge, ct_pdo,
298                             loc);
299     }
300   }
301 
302   /* compute remaining parameters */
303   switch (schedtype) {
304   case kmp_sch_static: {
305     if (trip_count < nth) {
          // Fewer iterations than threads: thread i (i < trip_count) gets
          // exactly iteration i, every other thread gets an empty range.
306       KMP_DEBUG_ASSERT(
307           __kmp_static == kmp_sch_static_greedy ||
308           __kmp_static ==
309               kmp_sch_static_balanced); // Unknown static scheduling type.
310       if (tid < trip_count) {
311         *pupper = *plower = *plower + tid * incr;
312       } else {
313         // set bounds so non-active threads execute no iterations
314         *plower = *pupper + (incr > 0 ? 1 : -1);
315       }
316       if (plastiter != NULL)
317         *plastiter = (tid == trip_count - 1);
318     } else {
319       if (__kmp_static == kmp_sch_static_balanced) {
            // Balanced split: the first `extras` threads receive
            // small_chunk + 1 iterations, the remaining ones small_chunk.
320         UT small_chunk = trip_count / nth;
321         UT extras = trip_count % nth;
322         *plower += incr * (tid * small_chunk + (tid < extras ? tid : extras));
323         *pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr);
324         if (plastiter != NULL)
325           *plastiter = (tid == nth - 1);
326       } else {
            // Greedy split: every thread is offered ceil(trip_count / nth)
            // iterations (expressed here already multiplied by incr); the
            // result is then clamped against the original upper bound.
327         T big_chunk_inc_count =
328             (trip_count / nth + ((trip_count % nth) ? 1 : 0)) * incr;
329         T old_upper = *pupper;
330 
331         KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
332         // Unknown static scheduling type.
333 
334         *plower += tid * big_chunk_inc_count;
335         *pupper = *plower + big_chunk_inc_count - incr;
336         if (incr > 0) {
              // An upper bound below the lower bound means the addition
              // wrapped; saturate before comparing with old_upper.
337           if (*pupper < *plower)
338             *pupper = traits_t<T>::max_value;
339           if (plastiter != NULL)
340             *plastiter = *plower <= old_upper && *pupper > old_upper - incr;
341           if (*pupper > old_upper)
342             *pupper = old_upper; // tracker C73258
343         } else {
              // Mirror image of the branch above for a negative increment.
344           if (*pupper > *plower)
345             *pupper = traits_t<T>::min_value;
346           if (plastiter != NULL)
347             *plastiter = *plower >= old_upper && *pupper < old_upper - incr;
348           if (*pupper < old_upper)
349             *pupper = old_upper; // tracker C73258
350         }
351       }
352     }
353     *pstride = trip_count;
354     break;
355   }
356   case kmp_sch_static_chunked: {
357     ST span;
358     UT nchunks;
        // Clamp chunk into [1, trip_count].
359     if (chunk < 1)
360       chunk = 1;
361     else if ((UT)chunk > trip_count)
362       chunk = trip_count;
        // nchunks = ceil(trip_count / chunk); span is one chunk's extent in
        // loop-variable units.
363     nchunks = (trip_count) / (UT)chunk + (trip_count % (UT)chunk ? 1 : 0);
364     span = chunk * incr;
365     if (nchunks < nth) {
          // Fewer chunks than threads: threads with tid >= nchunks get an
          // empty range.
366       *pstride = span * nchunks;
367       if (tid < nchunks) {
368         *plower = *plower + (span * tid);
369         *pupper = *plower + span - incr;
370       } else {
371         *plower = *pupper + (incr > 0 ? 1 : -1);
372       }
373     } else {
374       *pstride = span * nth;
375       *plower = *plower + (span * tid);
376       *pupper = *plower + span - incr;
377     }
378     if (plastiter != NULL)
379       *plastiter = (tid == (nchunks - 1) % nth);
380     break;
381   }
382   case kmp_sch_static_balanced_chunked: {
383     T old_upper = *pupper;
384     // round up to make sure the chunk is enough to cover all iterations
385     UT span = (trip_count + nth - 1) / nth;
386 
387     // perform chunk adjustment
        // NOTE(review): the mask form rounds span up to a multiple of chunk
        // only when chunk is a power of two - confirm callers guarantee that.
388     chunk = (span + chunk - 1) & ~(chunk - 1);
389 
390     span = chunk * incr;
391     *plower = *plower + (span * tid);
392     *pupper = *plower + span - incr;
        // Clamp the computed upper bound to the original one.
393     if (incr > 0) {
394       if (*pupper > old_upper)
395         *pupper = old_upper;
396     } else if (*pupper < old_upper)
397       *pupper = old_upper;
398 
399     if (plastiter != NULL)
400       *plastiter = (tid == ((trip_count - 1) / (UT)chunk));
401     break;
402   }
403   default:
404     KMP_ASSERT2(0, "__kmpc_for_static_init: unknown scheduling type");
405     break;
406   }
407 
408 #if USE_ITT_BUILD
409   // Report loop metadata
410   if (KMP_MASTER_TID(tid) && __itt_metadata_add_ptr &&
411       __kmp_forkjoin_frames_mode == 3 && th->th.th_teams_microtask == NULL &&
412       team->t.t_active_level == 1) {
413     kmp_uint64 cur_chunk = chunk;
414     check_loc(loc);
415     // Calculate chunk in case it was not specified; it is specified for
416     // kmp_sch_static_chunked
417     if (schedtype == kmp_sch_static) {
418       cur_chunk = trip_count / nth + ((trip_count % nth) ? 1 : 0);
419     }
420     // 0 - "static" schedule
421     __kmp_itt_metadata_loop(loc, 0, trip_count, cur_chunk);
422   }
423 #endif
424 #ifdef KMP_DEBUG
425   {
426     char *buff;
427     // create format specifiers before the debug output
428     buff = __kmp_str_format("__kmpc_for_static_init: liter=%%d lower=%%%s "
429                             "upper=%%%s stride = %%%s signed?<%s>\n",
430                             traits_t<T>::spec, traits_t<T>::spec,
431                             traits_t<ST>::spec, traits_t<T>::spec);
432     KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride));
433     __kmp_str_free(&buff);
434   }
435 #endif
436   KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
437 
438 #if OMPT_SUPPORT && OMPT_OPTIONAL
439   if (ompt_enabled.ompt_callback_work) {
440     ompt_callbacks.ompt_callback(ompt_callback_work)(
441         ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
442         &(task_info->task_data), trip_count, codeptr);
443   }
444   if (ompt_enabled.ompt_callback_dispatch) {
445     ompt_dispatch_t dispatch_type;
446     ompt_data_t instance = ompt_data_none;
447     ompt_dispatch_chunk_t dispatch_chunk;
        // For sections the instance is codeptr; for loop and distribute
        // worksharing it points at a chunk descriptor filled in from the
        // thread's final bounds.
448     if (ompt_work_type == ompt_work_sections) {
449       dispatch_type = ompt_dispatch_section;
450       instance.ptr = codeptr;
451     } else {
452       OMPT_GET_DISPATCH_CHUNK(dispatch_chunk, *plower, *pupper, incr);
453       dispatch_type = (ompt_work_type == ompt_work_distribute)
454                           ? ompt_dispatch_distribute_chunk
455                           : ompt_dispatch_ws_loop_chunk;
456       instance.ptr = &dispatch_chunk;
457     }
458     ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
459         &(team_info->parallel_data), &(task_info->task_data), dispatch_type,
460         instance);
461   }
462 #endif
463 
464   KMP_STATS_LOOP_END(OMP_loop_static_iterations);
465   return;
466 }
467 
// Two-level static partitioning for a loop that is distributed across teams
// and then shared among the threads of each team.
//
// Step 1 carves the calling thread's team a chunk out of the whole iteration
// space [*plower, *pupper]; the team's upper bound is stored in *pupperDist.
// Step 2 splits that team chunk among the team's threads according to
// `schedule` and stores the calling thread's bounds in *plower / *pupper.
//
// Parameters:
//   loc         source location descriptor, used for consistency-check
//               diagnostics
//   gtid        global thread id, validated with __kmp_assert_valid_gtid()
//   schedule    schedule of the per-team loop; kmp_sch_static and
//               kmp_sch_static_chunked are handled, anything else trips
//               KMP_ASSERT2
//   plastiter   in/out: last-iteration flag. Step 1 sets it per team, step 2
//               only ever clears it for threads that do not own the tail of
//               the team's chunk
//   plower      in/out: lower bound
//   pupper      in/out: upper bound for the thread
//   pupperDist  out: upper bound of the team's chunk
//   pstride     out: initialised to *pupper - *plower, overwritten with
//               span * nth for kmp_sch_static_chunked
//   incr        loop increment; zero and bounds that contradict its sign are
//               reported only when __kmp_env_consistency_check is set
//   chunk       chunk size for kmp_sch_static_chunked (values < 1 become 1)
//   codeptr     (only when OMPT_SUPPORT && OMPT_OPTIONAL) forwarded to the
//               OMPT work callback
468 template <typename T>
469 static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
470                                        kmp_int32 schedule, kmp_int32 *plastiter,
471                                        T *plower, T *pupper, T *pupperDist,
472                                        typename traits_t<T>::signed_t *pstride,
473                                        typename traits_t<T>::signed_t incr,
474                                        typename traits_t<T>::signed_t chunk
475 #if OMPT_SUPPORT && OMPT_OPTIONAL
476                                        ,
477                                        void *codeptr
478 #endif
479 ) {
480   KMP_COUNT_BLOCK(OMP_DISTRIBUTE);
481   KMP_PUSH_PARTITIONED_TIMER(OMP_distribute);
482   KMP_PUSH_PARTITIONED_TIMER(OMP_distribute_scheduling);
483   typedef typename traits_t<T>::unsigned_t UT;
484   typedef typename traits_t<T>::signed_t ST;
485   kmp_uint32 tid;
486   kmp_uint32 nth;
487   kmp_uint32 team_id;
488   kmp_uint32 nteams;
489   UT trip_count;
490   kmp_team_t *team;
491   kmp_info_t *th;
492 
493   KMP_DEBUG_ASSERT(plastiter && plower && pupper && pupperDist && pstride);
494   KE_TRACE(10, ("__kmpc_dist_for_static_init called (%d)\n", gtid));
495   __kmp_assert_valid_gtid(gtid);
496 #ifdef KMP_DEBUG
497   {
498     char *buff;
499     // create format specifiers before the debug output
500     buff = __kmp_str_format(
501         "__kmpc_dist_for_static_init: T#%%d schedLoop=%%d liter=%%d "
502         "iter=(%%%s, %%%s, %%%s) chunk=%%%s signed?<%s>\n",
503         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec,
504         traits_t<ST>::spec, traits_t<T>::spec);
505     KD_TRACE(100,
506              (buff, gtid, schedule, *plastiter, *plower, *pupper, incr, chunk));
507     __kmp_str_free(&buff);
508   }
509 #endif
510 
511   if (__kmp_env_consistency_check) {
512     __kmp_push_workshare(gtid, ct_pdo, loc);
513     if (incr == 0) {
514       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
515                             loc);
516     }
517     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
518       // The loop is illegal.
519       // Some zero-trip loops maintained by compiler, e.g.:
520       //   for(i=10;i<0;++i) // lower >= upper - run-time check
521       //   for(i=0;i>10;--i) // lower <= upper - run-time check
522       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
523       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
524       // Compiler does not check the following illegal loops:
525       //   for(i=0;i<10;i+=incr) // where incr<0
526       //   for(i=10;i>0;i-=incr) // where incr<0
527       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
528     }
529   }
530   tid = __kmp_tid_from_gtid(gtid);
531   th = __kmp_threads[gtid];
532   nth = th->th.th_team_nproc;
533   team = th->th.th_team;
534   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
535   nteams = th->th.th_teams_size.nteams;
      // The team's t_master_tid serves as its index among the nteams teams.
536   team_id = team->t.t_master_tid;
537   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
538 
539   // compute global trip count
540   if (incr == 1) {
541     trip_count = *pupper - *plower + 1;
542   } else if (incr == -1) {
543     trip_count = *plower - *pupper + 1;
544   } else if (incr > 0) {
545     // upper-lower can exceed the limit of signed type
546     trip_count = (UT)(*pupper - *plower) / incr + 1;
547   } else {
548     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
549   }
550 
551   *pstride = *pupper - *plower; // just in case (can be unused)
552   if (trip_count <= nteams) {
553     KMP_DEBUG_ASSERT(
554         __kmp_static == kmp_sch_static_greedy ||
555         __kmp_static ==
556             kmp_sch_static_balanced); // Unknown static scheduling type.
557     // only primary threads of some teams get single iteration, other threads
558     // get nothing
559     if (team_id < trip_count && tid == 0) {
560       *pupper = *pupperDist = *plower = *plower + team_id * incr;
561     } else {
562       *pupperDist = *pupper;
563       *plower = *pupper + incr; // compiler should skip loop body
564     }
565     if (plastiter != NULL)
566       *plastiter = (tid == 0 && team_id == trip_count - 1);
567   } else {
568     // Get the team's chunk first (each team gets at most one chunk)
569     if (__kmp_static == kmp_sch_static_balanced) {
          // Balanced split over teams: the first `extras` teams receive
          // chunkD + 1 iterations, the remaining ones chunkD.
570       UT chunkD = trip_count / nteams;
571       UT extras = trip_count % nteams;
572       *plower +=
573           incr * (team_id * chunkD + (team_id < extras ? team_id : extras));
574       *pupperDist = *plower + chunkD * incr - (team_id < extras ? 0 : incr);
575       if (plastiter != NULL)
576         *plastiter = (team_id == nteams - 1);
577     } else {
          // Greedy split over teams: each team is offered
          // ceil(trip_count / nteams) iterations (already multiplied by incr).
578       T chunk_inc_count =
579           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
580       T upper = *pupper;
581       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
582       // Unknown static scheduling type.
583       *plower += team_id * chunk_inc_count;
584       *pupperDist = *plower + chunk_inc_count - incr;
585       // Check/correct bounds if needed
586       if (incr > 0) {
            // A team upper bound below the lower bound means the addition
            // wrapped; saturate before clamping to the original upper bound.
587         if (*pupperDist < *plower)
588           *pupperDist = traits_t<T>::max_value;
589         if (plastiter != NULL)
590           *plastiter = *plower <= upper && *pupperDist > upper - incr;
591         if (*pupperDist > upper)
592           *pupperDist = upper; // tracker C73258
593         if (*plower > *pupperDist) {
594           *pupper = *pupperDist; // no iterations available for the team
595           goto end;
596         }
597       } else {
            // Mirror image of the branch above for a negative increment.
598         if (*pupperDist > *plower)
599           *pupperDist = traits_t<T>::min_value;
600         if (plastiter != NULL)
601           *plastiter = *plower >= upper && *pupperDist < upper - incr;
602         if (*pupperDist < upper)
603           *pupperDist = upper; // tracker C73258
604         if (*plower < *pupperDist) {
605           *pupper = *pupperDist; // no iterations available for the team
606           goto end;
607         }
608       }
609     }
610     // Get the parallel loop chunk now (for thread)
611     // compute trip count for team's chunk
612     if (incr == 1) {
613       trip_count = *pupperDist - *plower + 1;
614     } else if (incr == -1) {
615       trip_count = *plower - *pupperDist + 1;
        // incr == 1 was handled above, so "> 1" covers every remaining
        // positive increment.
616     } else if (incr > 1) {
617       // upper-lower can exceed the limit of signed type
618       trip_count = (UT)(*pupperDist - *plower) / incr + 1;
619     } else {
620       trip_count = (UT)(*plower - *pupperDist) / (-incr) + 1;
621     }
622     KMP_DEBUG_ASSERT(trip_count);
623     switch (schedule) {
624     case kmp_sch_static: {
625       if (trip_count <= nth) {
            // At most one iteration per thread: thread i (i < trip_count)
            // gets iteration i of the team's chunk.
626         KMP_DEBUG_ASSERT(
627             __kmp_static == kmp_sch_static_greedy ||
628             __kmp_static ==
629                 kmp_sch_static_balanced); // Unknown static scheduling type.
630         if (tid < trip_count)
631           *pupper = *plower = *plower + tid * incr;
632         else
633           *plower = *pupper + incr; // no iterations available
            // The flag set for the team is kept only by the thread that owns
            // the team's final iteration; everyone else clears it.
634         if (plastiter != NULL)
635           if (*plastiter != 0 && !(tid == trip_count - 1))
636             *plastiter = 0;
637       } else {
638         if (__kmp_static == kmp_sch_static_balanced) {
              // Balanced split over threads, same scheme as for teams above.
639           UT chunkL = trip_count / nth;
640           UT extras = trip_count % nth;
641           *plower += incr * (tid * chunkL + (tid < extras ? tid : extras));
642           *pupper = *plower + chunkL * incr - (tid < extras ? 0 : incr);
643           if (plastiter != NULL)
644             if (*plastiter != 0 && !(tid == nth - 1))
645               *plastiter = 0;
646         } else {
              // Greedy split over threads, clamped against the team's upper
              // bound (*pupperDist).
647           T chunk_inc_count =
648               (trip_count / nth + ((trip_count % nth) ? 1 : 0)) * incr;
649           T upper = *pupperDist;
650           KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
651           // Unknown static scheduling type.
652           *plower += tid * chunk_inc_count;
653           *pupper = *plower + chunk_inc_count - incr;
654           if (incr > 0) {
655             if (*pupper < *plower)
656               *pupper = traits_t<T>::max_value;
657             if (plastiter != NULL)
658               if (*plastiter != 0 &&
659                   !(*plower <= upper && *pupper > upper - incr))
660                 *plastiter = 0;
661             if (*pupper > upper)
662               *pupper = upper; // tracker C73258
663           } else {
664             if (*pupper > *plower)
665               *pupper = traits_t<T>::min_value;
666             if (plastiter != NULL)
667               if (*plastiter != 0 &&
668                   !(*plower >= upper && *pupper < upper - incr))
669                 *plastiter = 0;
670             if (*pupper < upper)
671               *pupper = upper; // tracker C73258
672           }
673         }
674       }
675       break;
676     }
677     case kmp_sch_static_chunked: {
678       ST span;
679       if (chunk < 1)
680         chunk = 1;
          // span is one chunk's extent in loop-variable units; the stride is
          // set to nth such chunks.
681       span = chunk * incr;
682       *pstride = span * nth;
683       *plower = *plower + (span * tid);
684       *pupper = *plower + span - incr;
685       if (plastiter != NULL)
686         if (*plastiter != 0 && !(tid == ((trip_count - 1) / (UT)chunk) % nth))
687           *plastiter = 0;
688       break;
689     }
690     default:
691       KMP_ASSERT2(0,
692                   "__kmpc_dist_for_static_init: unknown loop scheduling type");
693       break;
694     }
695   }
    // Common exit; also the target of the "no iterations for the team" jumps.
696 end:;
697 #ifdef KMP_DEBUG
698   {
699     char *buff;
700     // create format specifiers before the debug output
701     buff = __kmp_str_format(
702         "__kmpc_dist_for_static_init: last=%%d lo=%%%s up=%%%s upDist=%%%s "
703         "stride=%%%s signed?<%s>\n",
704         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec,
705         traits_t<ST>::spec, traits_t<T>::spec);
706     KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pupperDist, *pstride));
707     __kmp_str_free(&buff);
708   }
709 #endif
710   KE_TRACE(10, ("__kmpc_dist_for_static_init: T#%d return\n", gtid));
711 #if OMPT_SUPPORT && OMPT_OPTIONAL
712   if (ompt_enabled.ompt_callback_work || ompt_enabled.ompt_callback_dispatch) {
713     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
714     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
715     if (ompt_enabled.ompt_callback_work) {
          // The work callback is always reported as ompt_work_distribute with
          // an iteration count of 0.
716       ompt_callbacks.ompt_callback(ompt_callback_work)(
717           ompt_work_distribute, ompt_scope_begin, &(team_info->parallel_data),
718           &(task_info->task_data), 0, codeptr);
719     }
720     if (ompt_enabled.ompt_callback_dispatch) {
721       ompt_data_t instance = ompt_data_none;
722       ompt_dispatch_chunk_t dispatch_chunk;
          // The dispatched chunk is described with the team-level bounds
          // (*plower .. *pupperDist).
723       OMPT_GET_DISPATCH_CHUNK(dispatch_chunk, *plower, *pupperDist, incr);
724       instance.ptr = &dispatch_chunk;
725       ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
726           &(team_info->parallel_data), &(task_info->task_data),
727           ompt_dispatch_distribute_chunk, instance);
728     }
729   }
730 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
731   KMP_STATS_LOOP_END(OMP_distribute_iterations);
732   return;
733 }
734 
735 template <typename T>
736 static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid,
737                                    kmp_int32 *p_last, T *p_lb, T *p_ub,
738                                    typename traits_t<T>::signed_t *p_st,
739                                    typename traits_t<T>::signed_t incr,
740                                    typename traits_t<T>::signed_t chunk) {
741   // The routine returns the first chunk distributed to the team and
742   // stride for next chunks calculation.
743   // Last iteration flag set for the team that will execute
744   // the last iteration of the loop.
745   // The routine is called for dist_schedule(static,chunk) only.
746   typedef typename traits_t<T>::unsigned_t UT;
747   typedef typename traits_t<T>::signed_t ST;
748   kmp_uint32 team_id;
749   kmp_uint32 nteams;
750   UT trip_count;
751   T lower;
752   T upper;
753   ST span;
754   kmp_team_t *team;
755   kmp_info_t *th;
756 
757   KMP_DEBUG_ASSERT(p_last && p_lb && p_ub && p_st);
758   KE_TRACE(10, ("__kmp_team_static_init called (%d)\n", gtid));
759   __kmp_assert_valid_gtid(gtid);
760 #ifdef KMP_DEBUG
761   {
762     char *buff;
763     // create format specifiers before the debug output
764     buff = __kmp_str_format("__kmp_team_static_init enter: T#%%d liter=%%d "
765                             "iter=(%%%s, %%%s, %%%s) chunk %%%s; signed?<%s>\n",
766                             traits_t<T>::spec, traits_t<T>::spec,
767                             traits_t<ST>::spec, traits_t<ST>::spec,
768                             traits_t<T>::spec);
769     KD_TRACE(100, (buff, gtid, *p_last, *p_lb, *p_ub, *p_st, chunk));
770     __kmp_str_free(&buff);
771   }
772 #endif
773 
774   lower = *p_lb;
775   upper = *p_ub;
776   if (__kmp_env_consistency_check) {
777     if (incr == 0) {
778       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
779                             loc);
780     }
781     if (incr > 0 ? (upper < lower) : (lower < upper)) {
782       // The loop is illegal.
783       // Some zero-trip loops maintained by compiler, e.g.:
784       //   for(i=10;i<0;++i) // lower >= upper - run-time check
785       //   for(i=0;i>10;--i) // lower <= upper - run-time check
786       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
787       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
788       // Compiler does not check the following illegal loops:
789       //   for(i=0;i<10;i+=incr) // where incr<0
790       //   for(i=10;i>0;i-=incr) // where incr<0
791       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
792     }
793   }
794   th = __kmp_threads[gtid];
795   team = th->th.th_team;
796   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
797   nteams = th->th.th_teams_size.nteams;
798   team_id = team->t.t_master_tid;
799   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
800 
801   // compute trip count
802   if (incr == 1) {
803     trip_count = upper - lower + 1;
804   } else if (incr == -1) {
805     trip_count = lower - upper + 1;
806   } else if (incr > 0) {
807     // upper-lower can exceed the limit of signed type
808     trip_count = (UT)(upper - lower) / incr + 1;
809   } else {
810     trip_count = (UT)(lower - upper) / (-incr) + 1;
811   }
812   if (chunk < 1)
813     chunk = 1;
814   span = chunk * incr;
815   *p_st = span * nteams;
816   *p_lb = lower + (span * team_id);
817   *p_ub = *p_lb + span - incr;
818   if (p_last != NULL)
819     *p_last = (team_id == ((trip_count - 1) / (UT)chunk) % nteams);
820   // Correct upper bound if needed
821   if (incr > 0) {
822     if (*p_ub < *p_lb) // overflow?
823       *p_ub = traits_t<T>::max_value;
824     if (*p_ub > upper)
825       *p_ub = upper; // tracker C73258
826   } else { // incr < 0
827     if (*p_ub > *p_lb)
828       *p_ub = traits_t<T>::min_value;
829     if (*p_ub < upper)
830       *p_ub = upper; // tracker C73258
831   }
832 #ifdef KMP_DEBUG
833   {
834     char *buff;
835     // create format specifiers before the debug output
836     buff =
837         __kmp_str_format("__kmp_team_static_init exit: T#%%d team%%u liter=%%d "
838                          "iter=(%%%s, %%%s, %%%s) chunk %%%s\n",
839                          traits_t<T>::spec, traits_t<T>::spec,
840                          traits_t<ST>::spec, traits_t<ST>::spec);
841     KD_TRACE(100, (buff, gtid, team_id, *p_last, *p_lb, *p_ub, *p_st, chunk));
842     __kmp_str_free(&buff);
843   }
844 #endif
845 }
846 
847 //------------------------------------------------------------------------------
848 extern "C" {
849 /*!
850 @ingroup WORK_SHARING
851 @param    loc       Source code location
852 @param    gtid      Global thread id of this thread
853 @param    schedtype  Scheduling type
854 @param    plastiter Pointer to the "last iteration" flag
855 @param    plower    Pointer to the lower bound
856 @param    pupper    Pointer to the upper bound
857 @param    pstride   Pointer to the stride
858 @param    incr      Loop increment
859 @param    chunk     The chunk size
860 
861 Each of the four functions here are identical apart from the argument types.
862 
863 The functions compute the upper and lower bounds and stride to be used for the
864 set of iterations to be executed by the current thread from the statically
865 scheduled loop that is described by the initial values of the bounds, stride,
866 increment and chunk size.
867 
868 @{
869 */
870 void __kmpc_for_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype,
871                               kmp_int32 *plastiter, kmp_int32 *plower,
872                               kmp_int32 *pupper, kmp_int32 *pstride,
873                               kmp_int32 incr, kmp_int32 chunk) {
874   __kmp_for_static_init<kmp_int32>(loc, gtid, schedtype, plastiter, plower,
875                                    pupper, pstride, incr, chunk
876 #if OMPT_SUPPORT && OMPT_OPTIONAL
877                                    ,
878                                    OMPT_GET_RETURN_ADDRESS(0)
879 #endif
880   );
881 }
882 
883 /*!
884  See @ref __kmpc_for_static_init_4
885  */
886 void __kmpc_for_static_init_4u(ident_t *loc, kmp_int32 gtid,
887                                kmp_int32 schedtype, kmp_int32 *plastiter,
888                                kmp_uint32 *plower, kmp_uint32 *pupper,
889                                kmp_int32 *pstride, kmp_int32 incr,
890                                kmp_int32 chunk) {
891   __kmp_for_static_init<kmp_uint32>(loc, gtid, schedtype, plastiter, plower,
892                                     pupper, pstride, incr, chunk
893 #if OMPT_SUPPORT && OMPT_OPTIONAL
894                                     ,
895                                     OMPT_GET_RETURN_ADDRESS(0)
896 #endif
897   );
898 }
899 
900 /*!
901  See @ref __kmpc_for_static_init_4
902  */
903 void __kmpc_for_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype,
904                               kmp_int32 *plastiter, kmp_int64 *plower,
905                               kmp_int64 *pupper, kmp_int64 *pstride,
906                               kmp_int64 incr, kmp_int64 chunk) {
907   __kmp_for_static_init<kmp_int64>(loc, gtid, schedtype, plastiter, plower,
908                                    pupper, pstride, incr, chunk
909 #if OMPT_SUPPORT && OMPT_OPTIONAL
910                                    ,
911                                    OMPT_GET_RETURN_ADDRESS(0)
912 #endif
913   );
914 }
915 
916 /*!
917  See @ref __kmpc_for_static_init_4
918  */
919 void __kmpc_for_static_init_8u(ident_t *loc, kmp_int32 gtid,
920                                kmp_int32 schedtype, kmp_int32 *plastiter,
921                                kmp_uint64 *plower, kmp_uint64 *pupper,
922                                kmp_int64 *pstride, kmp_int64 incr,
923                                kmp_int64 chunk) {
924   __kmp_for_static_init<kmp_uint64>(loc, gtid, schedtype, plastiter, plower,
925                                     pupper, pstride, incr, chunk
926 #if OMPT_SUPPORT && OMPT_OPTIONAL
927                                     ,
928                                     OMPT_GET_RETURN_ADDRESS(0)
929 #endif
930   );
931 }
932 /*!
933 @}
934 */
935 
// OMPT_CODEPTR_ARG is appended directly after the last regular argument of a
// call. It expands to ", OMPT_GET_RETURN_ADDRESS(0)" when both OMPT_SUPPORT
// and OMPT_OPTIONAL are enabled, and to nothing otherwise.
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_CODEPTR_ARG , OMPT_GET_RETURN_ADDRESS(0)
#else
#define OMPT_CODEPTR_ARG
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
941 
942 /*!
943 @ingroup WORK_SHARING
944 @param    loc       Source code location
945 @param    gtid      Global thread id of this thread
946 @param    schedule  Scheduling type for the parallel loop
947 @param    plastiter Pointer to the "last iteration" flag
948 @param    plower    Pointer to the lower bound
949 @param    pupper    Pointer to the upper bound of loop chunk
950 @param    pupperD   Pointer to the upper bound of dist_chunk
951 @param    pstride   Pointer to the stride for parallel loop
952 @param    incr      Loop increment
953 @param    chunk     The chunk size for the parallel loop
954 
955 Each of the four functions here are identical apart from the argument types.
956 
957 The functions compute the upper and lower bounds and strides to be used for the
958 set of iterations to be executed by the current thread from the statically
959 scheduled loop that is described by the initial values of the bounds, strides,
960 increment and chunks for parallel loop and distribute constructs.
961 
962 @{
963 */
964 void __kmpc_dist_for_static_init_4(ident_t *loc, kmp_int32 gtid,
965                                    kmp_int32 schedule, kmp_int32 *plastiter,
966                                    kmp_int32 *plower, kmp_int32 *pupper,
967                                    kmp_int32 *pupperD, kmp_int32 *pstride,
968                                    kmp_int32 incr, kmp_int32 chunk) {
969   __kmp_dist_for_static_init<kmp_int32>(loc, gtid, schedule, plastiter, plower,
970                                         pupper, pupperD, pstride, incr,
971                                         chunk OMPT_CODEPTR_ARG);
972 }
973 
974 /*!
975  See @ref __kmpc_dist_for_static_init_4
976  */
977 void __kmpc_dist_for_static_init_4u(ident_t *loc, kmp_int32 gtid,
978                                     kmp_int32 schedule, kmp_int32 *plastiter,
979                                     kmp_uint32 *plower, kmp_uint32 *pupper,
980                                     kmp_uint32 *pupperD, kmp_int32 *pstride,
981                                     kmp_int32 incr, kmp_int32 chunk) {
982   __kmp_dist_for_static_init<kmp_uint32>(loc, gtid, schedule, plastiter, plower,
983                                          pupper, pupperD, pstride, incr,
984                                          chunk OMPT_CODEPTR_ARG);
985 }
986 
987 /*!
988  See @ref __kmpc_dist_for_static_init_4
989  */
990 void __kmpc_dist_for_static_init_8(ident_t *loc, kmp_int32 gtid,
991                                    kmp_int32 schedule, kmp_int32 *plastiter,
992                                    kmp_int64 *plower, kmp_int64 *pupper,
993                                    kmp_int64 *pupperD, kmp_int64 *pstride,
994                                    kmp_int64 incr, kmp_int64 chunk) {
995   __kmp_dist_for_static_init<kmp_int64>(loc, gtid, schedule, plastiter, plower,
996                                         pupper, pupperD, pstride, incr,
997                                         chunk OMPT_CODEPTR_ARG);
998 }
999 
1000 /*!
1001  See @ref __kmpc_dist_for_static_init_4
1002  */
1003 void __kmpc_dist_for_static_init_8u(ident_t *loc, kmp_int32 gtid,
1004                                     kmp_int32 schedule, kmp_int32 *plastiter,
1005                                     kmp_uint64 *plower, kmp_uint64 *pupper,
1006                                     kmp_uint64 *pupperD, kmp_int64 *pstride,
1007                                     kmp_int64 incr, kmp_int64 chunk) {
1008   __kmp_dist_for_static_init<kmp_uint64>(loc, gtid, schedule, plastiter, plower,
1009                                          pupper, pupperD, pstride, incr,
1010                                          chunk OMPT_CODEPTR_ARG);
1011 }
1012 /*!
1013 @}
1014 */
1015 
1016 //------------------------------------------------------------------------------
1017 // Auxiliary routines for Distribute Parallel Loop construct implementation
1018 //    Transfer call to template< type T >
1019 //    __kmp_team_static_init( ident_t *loc, int gtid,
1020 //        int *p_last, T *lb, T *ub, ST *st, ST incr, ST chunk )
1021 
1022 /*!
1023 @ingroup WORK_SHARING
1024 @{
1025 @param loc Source location
1026 @param gtid Global thread id
1027 @param p_last pointer to last iteration flag
1028 @param p_lb  pointer to Lower bound
1029 @param p_ub  pointer to Upper bound
1030 @param p_st  Step (or increment if you prefer)
1031 @param incr  Loop increment
1032 @param chunk The chunk size to block with
1033 
1034 The functions compute the upper and lower bounds and stride to be used for the
1035 set of iterations to be executed by the current team from the statically
1036 scheduled loop that is described by the initial values of the bounds, stride,
1037 increment and chunk for the distribute construct as part of composite distribute
1038 parallel loop construct. These functions are all identical apart from the types
1039 of the arguments.
1040 */
1041 
1042 void __kmpc_team_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
1043                                kmp_int32 *p_lb, kmp_int32 *p_ub,
1044                                kmp_int32 *p_st, kmp_int32 incr,
1045                                kmp_int32 chunk) {
1046   KMP_DEBUG_ASSERT(__kmp_init_serial);
1047   __kmp_team_static_init<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
1048                                     chunk);
1049 }
1050 
1051 /*!
1052  See @ref __kmpc_team_static_init_4
1053  */
1054 void __kmpc_team_static_init_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
1055                                 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
1056                                 kmp_int32 *p_st, kmp_int32 incr,
1057                                 kmp_int32 chunk) {
1058   KMP_DEBUG_ASSERT(__kmp_init_serial);
1059   __kmp_team_static_init<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
1060                                      chunk);
1061 }
1062 
1063 /*!
1064  See @ref __kmpc_team_static_init_4
1065  */
1066 void __kmpc_team_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
1067                                kmp_int64 *p_lb, kmp_int64 *p_ub,
1068                                kmp_int64 *p_st, kmp_int64 incr,
1069                                kmp_int64 chunk) {
1070   KMP_DEBUG_ASSERT(__kmp_init_serial);
1071   __kmp_team_static_init<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
1072                                     chunk);
1073 }
1074 
1075 /*!
1076  See @ref __kmpc_team_static_init_4
1077  */
1078 void __kmpc_team_static_init_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
1079                                 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
1080                                 kmp_int64 *p_st, kmp_int64 incr,
1081                                 kmp_int64 chunk) {
1082   KMP_DEBUG_ASSERT(__kmp_init_serial);
1083   __kmp_team_static_init<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
1084                                      chunk);
1085 }
1086 /*!
1087 @}
1088 */
1089 
1090 } // extern "C"
1091