xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_sched.cpp (revision e1c4c8dd8d2d10b6104f06856a77bd5b4813a801)
1 /*
2  * kmp_sched.cpp -- static scheduling -- iteration initialization
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Static scheduling initialization.
14 
15   NOTE: team->t.t_nproc is a constant inside of any dispatch loop, however
16         it may change values between parallel regions.  __kmp_max_nth
17         is the largest value __kmp_nth may take, 1 is the smallest. */
18 
19 #include "kmp.h"
20 #include "kmp_error.h"
21 #include "kmp_i18n.h"
22 #include "kmp_itt.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 
26 #if OMPT_SUPPORT
27 #include "ompt-specific.h"
28 #endif
29 
30 #ifdef KMP_DEBUG
31 //-------------------------------------------------------------------------
32 // template for debug prints specification ( d, u, lld, llu, ld )
// Each spec is the printf-style conversion suffix for one integer type; the
// debug-trace code in this file splices it into the "%%%s" placeholders of
// the format strings it builds with __kmp_str_format.
33 char const *traits_t<int>::spec = "d";
34 char const *traits_t<unsigned int>::spec = "u";
35 char const *traits_t<long long>::spec = "lld";
36 char const *traits_t<unsigned long long>::spec = "llu";
37 char const *traits_t<long>::spec = "ld";
38 //-------------------------------------------------------------------------
39 #endif
40 
#if KMP_STATS_ENABLED
// Records, under counter `stat`, the number of iterations described by the
// bounds currently stored in *plower / *pupper with increment `incr`, then
// pops one partitioned timer.  The names `plower`, `pupper` and `incr` must be
// in scope at the point of use.  All arithmetic is done in kmp_int64.
//
// The body is wrapped in do { ... } while (0) so that the macro behaves as a
// single statement and can be followed by ';' in any context, including an
// unbraced if/else.
#define KMP_STATS_LOOP_END(stat)                                               \
  do {                                                                         \
    kmp_int64 t;                                                               \
    kmp_int64 u = (kmp_int64)(*pupper);                                        \
    kmp_int64 l = (kmp_int64)(*plower);                                        \
    kmp_int64 i = (kmp_int64)incr;                                             \
    if (i == 1) {                                                              \
      t = u - l + 1;                                                           \
    } else if (i == -1) {                                                      \
      t = l - u + 1;                                                           \
    } else if (i > 0) {                                                        \
      t = (u - l) / i + 1;                                                     \
    } else {                                                                   \
      t = (l - u) / (-i) + 1;                                                  \
    }                                                                          \
    KMP_COUNT_VALUE(stat, t);                                                  \
    KMP_POP_PARTITIONED_TIMER();                                               \
  } while (0)
#else
// Statistics disabled: expand to a no-op expression so the macro is still a
// well-formed statement when followed by ';'.
#define KMP_STATS_LOOP_END(stat) ((void)0)
#endif
63 
64 #if USE_ITT_BUILD || defined KMP_DEBUG
65 static ident_t loc_stub = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"};
66 static inline void check_loc(ident_t *&loc) {
67   if (loc == NULL)
68     loc = &loc_stub; // may need to report location info to ittnotify
69 }
70 #endif
71 
// Computes the calling thread's portion of a statically scheduled loop.
//
// On entry *plower / *pupper hold the bounds of the whole iteration space and
// incr is the loop increment.  On return they hold the bounds assigned to the
// calling thread, *pstride holds the stride computed for the selected
// schedule, and *plastiter tells whether this thread was given the last
// iteration.
//
//   loc        source location descriptor; it is tested against NULL before
//              its flags are read, and check_loc() substitutes a stub before
//              it is used for tracing / ITT reporting
//   global_tid global thread id of the caller (validated on entry)
//   schedtype  schedule kind; modifier bits are stripped first, and values
//              above kmp_ord_upper are treated as "distribute" schedules and
//              shifted onto the corresponding static kind
//   plastiter, plower, pupper, pstride
//              in/out results as described above (asserted non-NULL in debug
//              builds)
//   incr       loop increment; zero is reported as an error when consistency
//              checking is enabled
//   chunk      chunk size, used by the chunked schedules
//   codeptr    (OMPT builds only) forwarded to the OMPT callbacks
//
// Three early exits leave *plower / *pupper untouched: a zero-trip loop
// (*plastiter = FALSE), a serialized team, and a team of one thread (both
// *plastiter = TRUE).
72 template <typename T>
73 static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
74                                   kmp_int32 schedtype, kmp_int32 *plastiter,
75                                   T *plower, T *pupper,
76                                   typename traits_t<T>::signed_t *pstride,
77                                   typename traits_t<T>::signed_t incr,
78                                   typename traits_t<T>::signed_t chunk
79 #if OMPT_SUPPORT && OMPT_OPTIONAL
80                                   ,
81                                   void *codeptr
82 #endif
83 ) {
84   KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
85   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static);
86   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling);
87 
88   // Clear monotonic/nonmonotonic bits (ignore it)
89   schedtype = SCHEDULE_WITHOUT_MODIFIERS(schedtype);
90 
91   typedef typename traits_t<T>::unsigned_t UT;
92   typedef typename traits_t<T>::signed_t ST;
93   /*  this all has to be changed back to TID and such.. */
94   kmp_int32 gtid = global_tid;
95   kmp_uint32 tid;
96   kmp_uint32 nth;
97   UT trip_count;
98   kmp_team_t *team;
99   __kmp_assert_valid_gtid(gtid);
100   kmp_info_t *th = __kmp_threads[gtid];
101 
102 #if OMPT_SUPPORT && OMPT_OPTIONAL
103   ompt_team_info_t *team_info = NULL;
104   ompt_task_info_t *task_info = NULL;
105   ompt_work_t ompt_work_type = ompt_work_loop;
106 
  // Function-local flag: set to 1 by the first caller that meets a loc
  // without any workshare flag, so the warning below is issued only once.
107   static kmp_int8 warn = 0;
108 
109   if (ompt_enabled.ompt_callback_work || ompt_enabled.ompt_callback_dispatch) {
110     // Only fully initialize variables needed by OMPT if OMPT is enabled.
111     team_info = __ompt_get_teaminfo(0, NULL);
112     task_info = __ompt_get_task_info_object(0);
113     // Determine workshare type
114     if (loc != NULL) {
115       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
116         ompt_work_type = ompt_work_loop;
117       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
118         ompt_work_type = ompt_work_sections;
119       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
120         ompt_work_type = ompt_work_distribute;
121       } else {
122         kmp_int8 bool_res =
123             KMP_COMPARE_AND_STORE_ACQ8(&warn, (kmp_int8)0, (kmp_int8)1);
124         if (bool_res)
125           KMP_WARNING(OmptOutdatedWorkshare);
126       }
127       KMP_DEBUG_ASSERT(ompt_work_type);
128     }
129   }
130 #endif
131 
132   KMP_DEBUG_ASSERT(plastiter && plower && pupper && pstride);
133   KE_TRACE(10, ("__kmpc_for_static_init called (%d)\n", global_tid));
134 #ifdef KMP_DEBUG
135   {
136     char *buff;
137     // create format specifiers before the debug output
138     buff = __kmp_str_format(
139         "__kmpc_for_static_init: T#%%d sched=%%d liter=%%d iter=(%%%s,"
140         " %%%s, %%%s) incr=%%%s chunk=%%%s signed?<%s>\n",
141         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec,
142         traits_t<ST>::spec, traits_t<ST>::spec, traits_t<T>::spec);
143     KD_TRACE(100, (buff, global_tid, schedtype, *plastiter, *plower, *pupper,
144                    *pstride, incr, chunk));
145     __kmp_str_free(&buff);
146   }
147 #endif
148 
149   if (__kmp_env_consistency_check) {
150     __kmp_push_workshare(global_tid, ct_pdo, loc);
151     if (incr == 0) {
152       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
153                             loc);
154     }
155   }
156   /* special handling for zero-trip loops */
157   if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
158     if (plastiter != NULL)
159       *plastiter = FALSE;
160     /* leave pupper and plower set to entire iteration space */
161     *pstride = incr; /* value should never be used */
162 // *plower = *pupper - incr;
163 // let compiler bypass the illegal loop (like for(i=1;i<10;i--))
164 // THE LINE COMMENTED ABOVE CAUSED shape2F/h_tests_1.f TO HAVE A FAILURE
165 // ON A ZERO-TRIP LOOP (lower=1, upper=0,stride=1) - JPH June 23, 2009.
166 #ifdef KMP_DEBUG
167     {
168       char *buff;
169       // create format specifiers before the debug output
170       buff = __kmp_str_format("__kmpc_for_static_init:(ZERO TRIP) liter=%%d "
171                               "lower=%%%s upper=%%%s stride = %%%s "
172                               "signed?<%s>, loc = %%s\n",
173                               traits_t<T>::spec, traits_t<T>::spec,
174                               traits_t<ST>::spec, traits_t<T>::spec);
175       check_loc(loc);
176       KD_TRACE(100,
177                (buff, *plastiter, *plower, *pupper, *pstride, loc->psource));
178       __kmp_str_free(&buff);
179     }
180 #endif
181     KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
182 
183 #if OMPT_SUPPORT && OMPT_OPTIONAL
184     if (ompt_enabled.ompt_callback_work) {
185       ompt_callbacks.ompt_callback(ompt_callback_work)(
186           ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
187           &(task_info->task_data), 0, codeptr);
188     }
189 #endif
190     KMP_STATS_LOOP_END(OMP_loop_static_iterations);
191     return;
192   }
193 
194   // Although there are schedule enumerations above kmp_ord_upper which are not
195   // schedules for "distribute", the only ones which are useful are dynamic, so
196   // cannot be seen here, since this codepath is only executed for static
197   // schedules.
198   if (schedtype > kmp_ord_upper) {
199     // we are in DISTRIBUTE construct
200     schedtype += kmp_sch_static -
201                  kmp_distribute_static; // AC: convert to usual schedule type
    // For distribute, the work is split using the current team's
    // t_master_tid within its parent team, unless the current team is
    // serialized more than once, in which case tid 0 of the current team is
    // used.
202     if (th->th.th_team->t.t_serialized > 1) {
203       tid = 0;
204       team = th->th.th_team;
205     } else {
206       tid = th->th.th_team->t.t_master_tid;
207       team = th->th.th_team->t.t_parent;
208     }
209   } else {
210     tid = __kmp_tid_from_gtid(global_tid);
211     team = th->th.th_team;
212   }
213 
214   /* determine if "for" loop is an active worksharing construct */
215   if (team->t.t_serialized) {
216     /* serialized parallel, each thread executes whole iteration space */
217     if (plastiter != NULL)
218       *plastiter = TRUE;
219     /* leave pupper and plower set to entire iteration space */
220     *pstride =
221         (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
222 
223 #ifdef KMP_DEBUG
224     {
225       char *buff;
226       // create format specifiers before the debug output
227       buff = __kmp_str_format("__kmpc_for_static_init: (serial) liter=%%d "
228                               "lower=%%%s upper=%%%s stride = %%%s\n",
229                               traits_t<T>::spec, traits_t<T>::spec,
230                               traits_t<ST>::spec);
231       KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride));
232       __kmp_str_free(&buff);
233     }
234 #endif
235     KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
236 
237 #if OMPT_SUPPORT && OMPT_OPTIONAL
238     if (ompt_enabled.ompt_callback_work) {
239       ompt_callbacks.ompt_callback(ompt_callback_work)(
240           ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
241           &(task_info->task_data), *pstride, codeptr);
242     }
243 #endif
244     KMP_STATS_LOOP_END(OMP_loop_static_iterations);
245     return;
246   }
247   nth = team->t.t_nproc;
  // A team of one thread is handled exactly like the serialized case above:
  // the single thread keeps the whole iteration space.
248   if (nth == 1) {
249     if (plastiter != NULL)
250       *plastiter = TRUE;
251     *pstride =
252         (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
253 #ifdef KMP_DEBUG
254     {
255       char *buff;
256       // create format specifiers before the debug output
257       buff = __kmp_str_format("__kmpc_for_static_init: (serial) liter=%%d "
258                               "lower=%%%s upper=%%%s stride = %%%s\n",
259                               traits_t<T>::spec, traits_t<T>::spec,
260                               traits_t<ST>::spec);
261       KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride));
262       __kmp_str_free(&buff);
263     }
264 #endif
265     KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
266 
267 #if OMPT_SUPPORT && OMPT_OPTIONAL
268     if (ompt_enabled.ompt_callback_work) {
269       ompt_callbacks.ompt_callback(ompt_callback_work)(
270           ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
271           &(task_info->task_data), *pstride, codeptr);
272     }
273 #endif
274     KMP_STATS_LOOP_END(OMP_loop_static_iterations);
275     return;
276   }
277 
278   /* compute trip count */
  // The result is stored in the unsigned type UT.  A value of 0 with distinct
  // bounds is reported below as "iteration range too large" when consistency
  // checking is enabled.
279   if (incr == 1) {
280     trip_count = *pupper - *plower + 1;
281   } else if (incr == -1) {
282     trip_count = *plower - *pupper + 1;
283   } else if (incr > 0) {
284     // upper-lower can exceed the limit of signed type
285     trip_count = (UT)(*pupper - *plower) / incr + 1;
286   } else {
287     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
288   }
289 
290 #if KMP_STATS_ENABLED
291   if (KMP_MASTER_GTID(gtid)) {
292     KMP_COUNT_VALUE(OMP_loop_static_total_iterations, trip_count);
293   }
294 #endif
295 
296   if (__kmp_env_consistency_check) {
297     /* tripcount overflow? */
298     if (trip_count == 0 && *pupper != *plower) {
299       __kmp_error_construct(kmp_i18n_msg_CnsIterationRangeTooLarge, ct_pdo,
300                             loc);
301     }
302   }
303 
304   /* compute remaining parameters */
305   switch (schedtype) {
306   case kmp_sch_static: {
307     if (trip_count < nth) {
      // Fewer iterations than threads: thread i (i < trip_count) gets exactly
      // iteration i, the remaining threads get an empty range.
308       KMP_DEBUG_ASSERT(
309           __kmp_static == kmp_sch_static_greedy ||
310           __kmp_static ==
311               kmp_sch_static_balanced); // Unknown static scheduling type.
312       if (tid < trip_count) {
313         *pupper = *plower = *plower + tid * incr;
314       } else {
315         // set bounds so non-active threads execute no iterations
316         *plower = *pupper + (incr > 0 ? 1 : -1);
317       }
318       if (plastiter != NULL)
319         *plastiter = (tid == trip_count - 1);
320     } else {
321       if (__kmp_static == kmp_sch_static_balanced) {
        // Balanced: the first `extras` threads get small_chunk + 1
        // iterations, the others get small_chunk.
322         UT small_chunk = trip_count / nth;
323         UT extras = trip_count % nth;
324         *plower += incr * (tid * small_chunk + (tid < extras ? tid : extras));
325         *pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr);
326         if (plastiter != NULL)
327           *plastiter = (tid == nth - 1);
328       } else {
        // Greedy: every thread is offered ceil(trip_count / nth) iterations;
        // the upper bound is then clamped to the original upper bound.
329         T big_chunk_inc_count =
330             (trip_count / nth + ((trip_count % nth) ? 1 : 0)) * incr;
331         T old_upper = *pupper;
332 
333         KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
334         // Unknown static scheduling type.
335 
336         *plower += tid * big_chunk_inc_count;
337         *pupper = *plower + big_chunk_inc_count - incr;
338         if (incr > 0) {
          // An upper bound that ended up below the lower bound is replaced by
          // the type's maximum before the clamp to old_upper.
339           if (*pupper < *plower)
340             *pupper = traits_t<T>::max_value;
341           if (plastiter != NULL)
342             *plastiter = *plower <= old_upper && *pupper > old_upper - incr;
343           if (*pupper > old_upper)
344             *pupper = old_upper; // tracker C73258
345         } else {
346           if (*pupper > *plower)
347             *pupper = traits_t<T>::min_value;
348           if (plastiter != NULL)
349             *plastiter = *plower >= old_upper && *pupper < old_upper - incr;
350           if (*pupper < old_upper)
351             *pupper = old_upper; // tracker C73258
352         }
353       }
354     }
355     *pstride = trip_count;
356     break;
357   }
358   case kmp_sch_static_chunked: {
359     ST span;
360     UT nchunks;
    // Clamp chunk into [1, trip_count]; nchunks is ceil(trip_count / chunk).
361     if (chunk < 1)
362       chunk = 1;
363     else if ((UT)chunk > trip_count)
364       chunk = trip_count;
365     nchunks = (trip_count) / (UT)chunk + (trip_count % (UT)chunk ? 1 : 0);
366     span = chunk * incr;
    // The stride covers one chunk for each thread that has work, i.e.
    // span * min(nchunks, nth).
367     if (nchunks < nth) {
368       *pstride = span * nchunks;
369       if (tid < nchunks) {
370         *plower = *plower + (span * tid);
371         *pupper = *plower + span - incr;
372       } else {
373         *plower = *pupper + (incr > 0 ? 1 : -1);
374       }
375     } else {
376       *pstride = span * nth;
377       *plower = *plower + (span * tid);
378       *pupper = *plower + span - incr;
379     }
380     if (plastiter != NULL)
381       *plastiter = (tid == (nchunks - 1) % nth);
382     break;
383   }
384   case kmp_sch_static_balanced_chunked: {
385     T old_upper = *pupper;
386     // round up to make sure the chunk is enough to cover all iterations
387     UT span = (trip_count + nth - 1) / nth;
388 
389     // perform chunk adjustment
    // NOTE(review): the mask arithmetic rounds span up to a multiple of chunk
    // only if chunk is a positive power of two; presumably the caller
    // guarantees this -- confirm.  *pstride is not written on this path.
390     chunk = (span + chunk - 1) & ~(chunk - 1);
391 
392     span = chunk * incr;
393     *plower = *plower + (span * tid);
394     *pupper = *plower + span - incr;
395     if (incr > 0) {
396       if (*pupper > old_upper)
397         *pupper = old_upper;
398     } else if (*pupper < old_upper)
399       *pupper = old_upper;
400 
401     if (plastiter != NULL)
402       *plastiter = (tid == ((trip_count - 1) / (UT)chunk));
403     break;
404   }
405   default:
406     KMP_ASSERT2(0, "__kmpc_for_static_init: unknown scheduling type");
407     break;
408   }
409 
410 #if USE_ITT_BUILD
411   // Report loop metadata
412   if (KMP_MASTER_TID(tid) && __itt_metadata_add_ptr &&
413       __kmp_forkjoin_frames_mode == 3 && th->th.th_teams_microtask == NULL &&
414       team->t.t_active_level == 1) {
415     kmp_uint64 cur_chunk = chunk;
416     check_loc(loc);
417     // Calculate chunk in case it was not specified; it is specified for
418     // kmp_sch_static_chunked
419     if (schedtype == kmp_sch_static) {
420       cur_chunk = trip_count / nth + ((trip_count % nth) ? 1 : 0);
421     }
422     // 0 - "static" schedule
423     __kmp_itt_metadata_loop(loc, 0, trip_count, cur_chunk);
424   }
425 #endif
426 #ifdef KMP_DEBUG
427   {
428     char *buff;
429     // create format specifiers before the debug output
430     buff = __kmp_str_format("__kmpc_for_static_init: liter=%%d lower=%%%s "
431                             "upper=%%%s stride = %%%s signed?<%s>\n",
432                             traits_t<T>::spec, traits_t<T>::spec,
433                             traits_t<ST>::spec, traits_t<T>::spec);
434     KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride));
435     __kmp_str_free(&buff);
436   }
437 #endif
438   KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
439 
440 #if OMPT_SUPPORT && OMPT_OPTIONAL
441   if (ompt_enabled.ompt_callback_work) {
442     ompt_callbacks.ompt_callback(ompt_callback_work)(
443         ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
444         &(task_info->task_data), trip_count, codeptr);
445   }
  // The dispatch callback receives codeptr for sections and the thread's
  // chunk bounds for loop / distribute worksharing.
446   if (ompt_enabled.ompt_callback_dispatch) {
447     ompt_dispatch_t dispatch_type;
448     ompt_data_t instance = ompt_data_none;
449     ompt_dispatch_chunk_t dispatch_chunk;
450     if (ompt_work_type == ompt_work_sections) {
451       dispatch_type = ompt_dispatch_section;
452       instance.ptr = codeptr;
453     } else {
454       OMPT_GET_DISPATCH_CHUNK(dispatch_chunk, *plower, *pupper, incr);
455       dispatch_type = (ompt_work_type == ompt_work_distribute)
456                           ? ompt_dispatch_distribute_chunk
457                           : ompt_dispatch_ws_loop_chunk;
458       instance.ptr = &dispatch_chunk;
459     }
460     ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
461         &(team_info->parallel_data), &(task_info->task_data), dispatch_type,
462         instance);
463   }
464 #endif
465 
466   KMP_STATS_LOOP_END(OMP_loop_static_iterations);
467   return;
468 }
469 
// Computes loop bounds for a static worksharing loop combined with
// "distribute" inside a teams construct (asserted via th_teams_microtask).
// The work is split in two stages:
//   1. the calling team's chunk of the whole iteration space is selected
//      using team_id / nteams; its upper bound is stored in *pupperDist;
//   2. that chunk is divided among the nth threads of the team according to
//      `schedule` (kmp_sch_static or kmp_sch_static_chunked); the calling
//      thread's bounds are stored in *plower / *pupper.
// If there are no more iterations than teams, thread 0 of each of the first
// trip_count teams receives a single iteration and stage 2 is skipped.
//
// *plastiter is first set for the team that owns the last iteration and is
// then cleared in stage 2 for threads that do not own it.  *pstride is preset
// to *pupper - *plower and is replaced only by the chunked schedule.
//
//   loc      source location descriptor, passed to the consistency checks
//   gtid     global thread id of the caller (validated on entry)
//   schedule schedule kind used inside the team
//   incr     loop increment; zero and wrong-direction increments are reported
//            as errors when consistency checking is enabled
//   chunk    chunk size for kmp_sch_static_chunked (values < 1 become 1)
//   codeptr  (OMPT builds only) forwarded to the OMPT work callback
470 template <typename T>
471 static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
472                                        kmp_int32 schedule, kmp_int32 *plastiter,
473                                        T *plower, T *pupper, T *pupperDist,
474                                        typename traits_t<T>::signed_t *pstride,
475                                        typename traits_t<T>::signed_t incr,
476                                        typename traits_t<T>::signed_t chunk
477 #if OMPT_SUPPORT && OMPT_OPTIONAL
478                                        ,
479                                        void *codeptr
480 #endif
481 ) {
482   KMP_COUNT_BLOCK(OMP_DISTRIBUTE);
483   KMP_PUSH_PARTITIONED_TIMER(OMP_distribute);
484   KMP_PUSH_PARTITIONED_TIMER(OMP_distribute_scheduling);
485   typedef typename traits_t<T>::unsigned_t UT;
486   typedef typename traits_t<T>::signed_t ST;
487   kmp_uint32 tid;
488   kmp_uint32 nth;
489   kmp_uint32 team_id;
490   kmp_uint32 nteams;
491   UT trip_count;
492   kmp_team_t *team;
493   kmp_info_t *th;
494 
495   KMP_DEBUG_ASSERT(plastiter && plower && pupper && pupperDist && pstride);
496   KE_TRACE(10, ("__kmpc_dist_for_static_init called (%d)\n", gtid));
497   __kmp_assert_valid_gtid(gtid);
498 #ifdef KMP_DEBUG
499   {
500     char *buff;
501     // create format specifiers before the debug output
502     buff = __kmp_str_format(
503         "__kmpc_dist_for_static_init: T#%%d schedLoop=%%d liter=%%d "
504         "iter=(%%%s, %%%s, %%%s) chunk=%%%s signed?<%s>\n",
505         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec,
506         traits_t<ST>::spec, traits_t<T>::spec);
507     KD_TRACE(100,
508              (buff, gtid, schedule, *plastiter, *plower, *pupper, incr, chunk));
509     __kmp_str_free(&buff);
510   }
511 #endif
512 
513   if (__kmp_env_consistency_check) {
514     __kmp_push_workshare(gtid, ct_pdo, loc);
515     if (incr == 0) {
516       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
517                             loc);
518     }
519     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
520       // The loop is illegal.
521       // Some zero-trip loops maintained by compiler, e.g.:
522       //   for(i=10;i<0;++i) // lower >= upper - run-time check
523       //   for(i=0;i>10;--i) // lower <= upper - run-time check
524       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
525       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
526       // Compiler does not check the following illegal loops:
527       //   for(i=0;i<10;i+=incr) // where incr<0
528       //   for(i=10;i>0;i-=incr) // where incr<0
529       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
530     }
531   }
532   tid = __kmp_tid_from_gtid(gtid);
533   th = __kmp_threads[gtid];
534   nth = th->th.th_team_nproc;
535   team = th->th.th_team;
536   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
537   nteams = th->th.th_teams_size.nteams;
  // The team's index among the teams is its t_master_tid; the assertion
  // below checks that nteams matches the parent team's t_nproc.
538   team_id = team->t.t_master_tid;
539   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
540 
541   // compute global trip count
542   if (incr == 1) {
543     trip_count = *pupper - *plower + 1;
544   } else if (incr == -1) {
545     trip_count = *plower - *pupper + 1;
546   } else if (incr > 0) {
547     // upper-lower can exceed the limit of signed type
548     trip_count = (UT)(*pupper - *plower) / incr + 1;
549   } else {
550     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
551   }
552 
553   *pstride = *pupper - *plower; // just in case (can be unused)
554   if (trip_count <= nteams) {
555     KMP_DEBUG_ASSERT(
556         __kmp_static == kmp_sch_static_greedy ||
557         __kmp_static ==
558             kmp_sch_static_balanced); // Unknown static scheduling type.
559     // only primary threads of some teams get single iteration, other threads
560     // get nothing
561     if (team_id < trip_count && tid == 0) {
562       *pupper = *pupperDist = *plower = *plower + team_id * incr;
563     } else {
564       *pupperDist = *pupper;
565       *plower = *pupper + incr; // compiler should skip loop body
566     }
567     if (plastiter != NULL)
568       *plastiter = (tid == 0 && team_id == trip_count - 1);
569   } else {
570     // Get the team's chunk first (each team gets at most one chunk)
571     if (__kmp_static == kmp_sch_static_balanced) {
      // Balanced: the first `extras` teams get chunkD + 1 iterations, the
      // others get chunkD.
572       UT chunkD = trip_count / nteams;
573       UT extras = trip_count % nteams;
574       *plower +=
575           incr * (team_id * chunkD + (team_id < extras ? team_id : extras));
576       *pupperDist = *plower + chunkD * incr - (team_id < extras ? 0 : incr);
577       if (plastiter != NULL)
578         *plastiter = (team_id == nteams - 1);
579     } else {
      // Greedy: every team is offered ceil(trip_count / nteams) iterations;
      // the team's upper bound is then clamped to the global upper bound.
580       T chunk_inc_count =
581           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
582       T upper = *pupper;
583       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
584       // Unknown static scheduling type.
585       *plower += team_id * chunk_inc_count;
586       *pupperDist = *plower + chunk_inc_count - incr;
587       // Check/correct bounds if needed
588       if (incr > 0) {
589         if (*pupperDist < *plower)
590           *pupperDist = traits_t<T>::max_value;
591         if (plastiter != NULL)
592           *plastiter = *plower <= upper && *pupperDist > upper - incr;
593         if (*pupperDist > upper)
594           *pupperDist = upper; // tracker C73258
595         if (*plower > *pupperDist) {
596           *pupper = *pupperDist; // no iterations available for the team
597           goto end;
598         }
599       } else {
600         if (*pupperDist > *plower)
601           *pupperDist = traits_t<T>::min_value;
602         if (plastiter != NULL)
603           *plastiter = *plower >= upper && *pupperDist < upper - incr;
604         if (*pupperDist < upper)
605           *pupperDist = upper; // tracker C73258
606         if (*plower < *pupperDist) {
607           *pupper = *pupperDist; // no iterations available for the team
608           goto end;
609         }
610       }
611     }
612     // Get the parallel loop chunk now (for thread)
613     // compute trip count for team's chunk
614     if (incr == 1) {
615       trip_count = *pupperDist - *plower + 1;
616     } else if (incr == -1) {
617       trip_count = *plower - *pupperDist + 1;
    // incr == 1 was handled above, so "incr > 1" selects every remaining
    // positive increment here.
618     } else if (incr > 1) {
619       // upper-lower can exceed the limit of signed type
620       trip_count = (UT)(*pupperDist - *plower) / incr + 1;
621     } else {
622       trip_count = (UT)(*plower - *pupperDist) / (-incr) + 1;
623     }
624     KMP_DEBUG_ASSERT(trip_count);
625     switch (schedule) {
626     case kmp_sch_static: {
627       if (trip_count <= nth) {
        // No more iterations than threads: thread i (i < trip_count) gets
        // exactly iteration i of the team's chunk.
628         KMP_DEBUG_ASSERT(
629             __kmp_static == kmp_sch_static_greedy ||
630             __kmp_static ==
631                 kmp_sch_static_balanced); // Unknown static scheduling type.
632         if (tid < trip_count)
633           *pupper = *plower = *plower + tid * incr;
634         else
635           *plower = *pupper + incr; // no iterations available
        // *plastiter currently holds the team-level answer; keep it only for
        // the thread that owns the team's last iteration.
636         if (plastiter != NULL)
637           if (*plastiter != 0 && !(tid == trip_count - 1))
638             *plastiter = 0;
639       } else {
640         if (__kmp_static == kmp_sch_static_balanced) {
641           UT chunkL = trip_count / nth;
642           UT extras = trip_count % nth;
643           *plower += incr * (tid * chunkL + (tid < extras ? tid : extras));
644           *pupper = *plower + chunkL * incr - (tid < extras ? 0 : incr);
645           if (plastiter != NULL)
646             if (*plastiter != 0 && !(tid == nth - 1))
647               *plastiter = 0;
648         } else {
649           T chunk_inc_count =
650               (trip_count / nth + ((trip_count % nth) ? 1 : 0)) * incr;
651           T upper = *pupperDist;
652           KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
653           // Unknown static scheduling type.
654           *plower += tid * chunk_inc_count;
655           *pupper = *plower + chunk_inc_count - incr;
656           if (incr > 0) {
657             if (*pupper < *plower)
658               *pupper = traits_t<T>::max_value;
659             if (plastiter != NULL)
660               if (*plastiter != 0 &&
661                   !(*plower <= upper && *pupper > upper - incr))
662                 *plastiter = 0;
663             if (*pupper > upper)
664               *pupper = upper; // tracker C73258
665           } else {
666             if (*pupper > *plower)
667               *pupper = traits_t<T>::min_value;
668             if (plastiter != NULL)
669               if (*plastiter != 0 &&
670                   !(*plower >= upper && *pupper < upper - incr))
671                 *plastiter = 0;
672             if (*pupper < upper)
673               *pupper = upper; // tracker C73258
674           }
675         }
676       }
677       break;
678     }
679     case kmp_sch_static_chunked: {
680       ST span;
681       if (chunk < 1)
682         chunk = 1;
683       span = chunk * incr;
684       *pstride = span * nth;
685       *plower = *plower + (span * tid);
686       *pupper = *plower + span - incr;
687       if (plastiter != NULL)
688         if (*plastiter != 0 && !(tid == ((trip_count - 1) / (UT)chunk) % nth))
689           *plastiter = 0;
690       break;
691     }
692     default:
693       KMP_ASSERT2(0,
694                   "__kmpc_dist_for_static_init: unknown loop scheduling type");
695       break;
696     }
697   }
698 end:;
699 #ifdef KMP_DEBUG
700   {
701     char *buff;
702     // create format specifiers before the debug output
703     buff = __kmp_str_format(
704         "__kmpc_dist_for_static_init: last=%%d lo=%%%s up=%%%s upDist=%%%s "
705         "stride=%%%s signed?<%s>\n",
706         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec,
707         traits_t<ST>::spec, traits_t<T>::spec);
708     KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pupperDist, *pstride));
709     __kmp_str_free(&buff);
710   }
711 #endif
712   KE_TRACE(10, ("__kmpc_dist_for_static_init: T#%d return\n", gtid));
713 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // The work callback is always issued as ompt_work_distribute with a count
  // of 0; the dispatch callback reports the team's chunk (*plower up to
  // *pupperDist).
714   if (ompt_enabled.ompt_callback_work || ompt_enabled.ompt_callback_dispatch) {
715     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
716     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
717     if (ompt_enabled.ompt_callback_work) {
718       ompt_callbacks.ompt_callback(ompt_callback_work)(
719           ompt_work_distribute, ompt_scope_begin, &(team_info->parallel_data),
720           &(task_info->task_data), 0, codeptr);
721     }
722     if (ompt_enabled.ompt_callback_dispatch) {
723       ompt_data_t instance = ompt_data_none;
724       ompt_dispatch_chunk_t dispatch_chunk;
725       OMPT_GET_DISPATCH_CHUNK(dispatch_chunk, *plower, *pupperDist, incr);
726       instance.ptr = &dispatch_chunk;
727       ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
728           &(team_info->parallel_data), &(task_info->task_data),
729           ompt_dispatch_distribute_chunk, instance);
730     }
731   }
732 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
733   KMP_STATS_LOOP_END(OMP_distribute_iterations);
734   return;
735 }
736 
737 template <typename T>
738 static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid,
739                                    kmp_int32 *p_last, T *p_lb, T *p_ub,
740                                    typename traits_t<T>::signed_t *p_st,
741                                    typename traits_t<T>::signed_t incr,
742                                    typename traits_t<T>::signed_t chunk) {
743   // The routine returns the first chunk distributed to the team and
744   // stride for next chunks calculation.
745   // Last iteration flag set for the team that will execute
746   // the last iteration of the loop.
747   // The routine is called for dist_schedule(static,chunk) only.
748   typedef typename traits_t<T>::unsigned_t UT;
749   typedef typename traits_t<T>::signed_t ST;
750   kmp_uint32 team_id;
751   kmp_uint32 nteams;
752   UT trip_count;
753   T lower;
754   T upper;
755   ST span;
756   kmp_team_t *team;
757   kmp_info_t *th;
758 
759   KMP_DEBUG_ASSERT(p_last && p_lb && p_ub && p_st);
760   KE_TRACE(10, ("__kmp_team_static_init called (%d)\n", gtid));
761   __kmp_assert_valid_gtid(gtid);
762 #ifdef KMP_DEBUG
763   {
764     char *buff;
765     // create format specifiers before the debug output
766     buff = __kmp_str_format("__kmp_team_static_init enter: T#%%d liter=%%d "
767                             "iter=(%%%s, %%%s, %%%s) chunk %%%s; signed?<%s>\n",
768                             traits_t<T>::spec, traits_t<T>::spec,
769                             traits_t<ST>::spec, traits_t<ST>::spec,
770                             traits_t<T>::spec);
771     KD_TRACE(100, (buff, gtid, *p_last, *p_lb, *p_ub, *p_st, chunk));
772     __kmp_str_free(&buff);
773   }
774 #endif
775 
776   lower = *p_lb;
777   upper = *p_ub;
778   if (__kmp_env_consistency_check) {
779     if (incr == 0) {
780       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
781                             loc);
782     }
783     if (incr > 0 ? (upper < lower) : (lower < upper)) {
784       // The loop is illegal.
785       // Some zero-trip loops maintained by compiler, e.g.:
786       //   for(i=10;i<0;++i) // lower >= upper - run-time check
787       //   for(i=0;i>10;--i) // lower <= upper - run-time check
788       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
789       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
790       // Compiler does not check the following illegal loops:
791       //   for(i=0;i<10;i+=incr) // where incr<0
792       //   for(i=10;i>0;i-=incr) // where incr<0
793       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
794     }
795   }
796   th = __kmp_threads[gtid];
797   team = th->th.th_team;
798   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
799   nteams = th->th.th_teams_size.nteams;
800   team_id = team->t.t_master_tid;
801   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
802 
803   // compute trip count
804   if (incr == 1) {
805     trip_count = upper - lower + 1;
806   } else if (incr == -1) {
807     trip_count = lower - upper + 1;
808   } else if (incr > 0) {
809     // upper-lower can exceed the limit of signed type
810     trip_count = (UT)(upper - lower) / incr + 1;
811   } else {
812     trip_count = (UT)(lower - upper) / (-incr) + 1;
813   }
814   if (chunk < 1)
815     chunk = 1;
816   span = chunk * incr;
817   *p_st = span * nteams;
818   *p_lb = lower + (span * team_id);
819   *p_ub = *p_lb + span - incr;
820   if (p_last != NULL)
821     *p_last = (team_id == ((trip_count - 1) / (UT)chunk) % nteams);
822   // Correct upper bound if needed
823   if (incr > 0) {
824     if (*p_ub < *p_lb) // overflow?
825       *p_ub = traits_t<T>::max_value;
826     if (*p_ub > upper)
827       *p_ub = upper; // tracker C73258
828   } else { // incr < 0
829     if (*p_ub > *p_lb)
830       *p_ub = traits_t<T>::min_value;
831     if (*p_ub < upper)
832       *p_ub = upper; // tracker C73258
833   }
834 #ifdef KMP_DEBUG
835   {
836     char *buff;
837     // create format specifiers before the debug output
838     buff =
839         __kmp_str_format("__kmp_team_static_init exit: T#%%d team%%u liter=%%d "
840                          "iter=(%%%s, %%%s, %%%s) chunk %%%s\n",
841                          traits_t<T>::spec, traits_t<T>::spec,
842                          traits_t<ST>::spec, traits_t<ST>::spec);
843     KD_TRACE(100, (buff, gtid, team_id, *p_last, *p_lb, *p_ub, *p_st, chunk));
844     __kmp_str_free(&buff);
845   }
846 #endif
847 }
848 
849 //------------------------------------------------------------------------------
850 extern "C" {
851 /*!
852 @ingroup WORK_SHARING
853 @param    loc       Source code location
854 @param    gtid      Global thread id of this thread
855 @param    schedtype  Scheduling type
856 @param    plastiter Pointer to the "last iteration" flag
857 @param    plower    Pointer to the lower bound
858 @param    pupper    Pointer to the upper bound
859 @param    pstride   Pointer to the stride
860 @param    incr      Loop increment
861 @param    chunk     The chunk size
862 
863 Each of the four functions here are identical apart from the argument types.
864 
865 The functions compute the upper and lower bounds and stride to be used for the
866 set of iterations to be executed by the current thread from the statically
867 scheduled loop that is described by the initial values of the bounds, stride,
868 increment and chunk size.
869 
870 @{
871 */
872 void __kmpc_for_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype,
873                               kmp_int32 *plastiter, kmp_int32 *plower,
874                               kmp_int32 *pupper, kmp_int32 *pstride,
875                               kmp_int32 incr, kmp_int32 chunk) {
876   __kmp_for_static_init<kmp_int32>(loc, gtid, schedtype, plastiter, plower,
877                                    pupper, pstride, incr, chunk
878 #if OMPT_SUPPORT && OMPT_OPTIONAL
879                                    ,
880                                    OMPT_GET_RETURN_ADDRESS(0)
881 #endif
882   );
883 }
884 
885 /*!
886  See @ref __kmpc_for_static_init_4
887  */
888 void __kmpc_for_static_init_4u(ident_t *loc, kmp_int32 gtid,
889                                kmp_int32 schedtype, kmp_int32 *plastiter,
890                                kmp_uint32 *plower, kmp_uint32 *pupper,
891                                kmp_int32 *pstride, kmp_int32 incr,
892                                kmp_int32 chunk) {
893   __kmp_for_static_init<kmp_uint32>(loc, gtid, schedtype, plastiter, plower,
894                                     pupper, pstride, incr, chunk
895 #if OMPT_SUPPORT && OMPT_OPTIONAL
896                                     ,
897                                     OMPT_GET_RETURN_ADDRESS(0)
898 #endif
899   );
900 }
901 
902 /*!
903  See @ref __kmpc_for_static_init_4
904  */
905 void __kmpc_for_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype,
906                               kmp_int32 *plastiter, kmp_int64 *plower,
907                               kmp_int64 *pupper, kmp_int64 *pstride,
908                               kmp_int64 incr, kmp_int64 chunk) {
909   __kmp_for_static_init<kmp_int64>(loc, gtid, schedtype, plastiter, plower,
910                                    pupper, pstride, incr, chunk
911 #if OMPT_SUPPORT && OMPT_OPTIONAL
912                                    ,
913                                    OMPT_GET_RETURN_ADDRESS(0)
914 #endif
915   );
916 }
917 
918 /*!
919  See @ref __kmpc_for_static_init_4
920  */
921 void __kmpc_for_static_init_8u(ident_t *loc, kmp_int32 gtid,
922                                kmp_int32 schedtype, kmp_int32 *plastiter,
923                                kmp_uint64 *plower, kmp_uint64 *pupper,
924                                kmp_int64 *pstride, kmp_int64 incr,
925                                kmp_int64 chunk) {
926   __kmp_for_static_init<kmp_uint64>(loc, gtid, schedtype, plastiter, plower,
927                                     pupper, pstride, incr, chunk
928 #if OMPT_SUPPORT && OMPT_OPTIONAL
929                                     ,
930                                     OMPT_GET_RETURN_ADDRESS(0)
931 #endif
932   );
933 }
934 /*!
935 @}
936 */
937 
// Optional trailing call argument: expands to ", OMPT_GET_RETURN_ADDRESS(0)"
// when both OMPT_SUPPORT and OMPT_OPTIONAL are enabled, and to nothing
// otherwise.
#if !(OMPT_SUPPORT && OMPT_OPTIONAL)
#define OMPT_CODEPTR_ARG
#else
#define OMPT_CODEPTR_ARG , OMPT_GET_RETURN_ADDRESS(0)
#endif
943 
944 /*!
945 @ingroup WORK_SHARING
946 @param    loc       Source code location
947 @param    gtid      Global thread id of this thread
948 @param    schedule  Scheduling type for the parallel loop
949 @param    plastiter Pointer to the "last iteration" flag
950 @param    plower    Pointer to the lower bound
951 @param    pupper    Pointer to the upper bound of loop chunk
952 @param    pupperD   Pointer to the upper bound of dist_chunk
953 @param    pstride   Pointer to the stride for parallel loop
954 @param    incr      Loop increment
955 @param    chunk     The chunk size for the parallel loop
956 
957 Each of the four functions here are identical apart from the argument types.
958 
959 The functions compute the upper and lower bounds and strides to be used for the
960 set of iterations to be executed by the current thread from the statically
961 scheduled loop that is described by the initial values of the bounds, strides,
962 increment and chunks for parallel loop and distribute constructs.
963 
964 @{
965 */
966 void __kmpc_dist_for_static_init_4(ident_t *loc, kmp_int32 gtid,
967                                    kmp_int32 schedule, kmp_int32 *plastiter,
968                                    kmp_int32 *plower, kmp_int32 *pupper,
969                                    kmp_int32 *pupperD, kmp_int32 *pstride,
970                                    kmp_int32 incr, kmp_int32 chunk) {
971   __kmp_dist_for_static_init<kmp_int32>(loc, gtid, schedule, plastiter, plower,
972                                         pupper, pupperD, pstride, incr,
973                                         chunk OMPT_CODEPTR_ARG);
974 }
975 
976 /*!
977  See @ref __kmpc_dist_for_static_init_4
978  */
979 void __kmpc_dist_for_static_init_4u(ident_t *loc, kmp_int32 gtid,
980                                     kmp_int32 schedule, kmp_int32 *plastiter,
981                                     kmp_uint32 *plower, kmp_uint32 *pupper,
982                                     kmp_uint32 *pupperD, kmp_int32 *pstride,
983                                     kmp_int32 incr, kmp_int32 chunk) {
984   __kmp_dist_for_static_init<kmp_uint32>(loc, gtid, schedule, plastiter, plower,
985                                          pupper, pupperD, pstride, incr,
986                                          chunk OMPT_CODEPTR_ARG);
987 }
988 
989 /*!
990  See @ref __kmpc_dist_for_static_init_4
991  */
992 void __kmpc_dist_for_static_init_8(ident_t *loc, kmp_int32 gtid,
993                                    kmp_int32 schedule, kmp_int32 *plastiter,
994                                    kmp_int64 *plower, kmp_int64 *pupper,
995                                    kmp_int64 *pupperD, kmp_int64 *pstride,
996                                    kmp_int64 incr, kmp_int64 chunk) {
997   __kmp_dist_for_static_init<kmp_int64>(loc, gtid, schedule, plastiter, plower,
998                                         pupper, pupperD, pstride, incr,
999                                         chunk OMPT_CODEPTR_ARG);
1000 }
1001 
1002 /*!
1003  See @ref __kmpc_dist_for_static_init_4
1004  */
1005 void __kmpc_dist_for_static_init_8u(ident_t *loc, kmp_int32 gtid,
1006                                     kmp_int32 schedule, kmp_int32 *plastiter,
1007                                     kmp_uint64 *plower, kmp_uint64 *pupper,
1008                                     kmp_uint64 *pupperD, kmp_int64 *pstride,
1009                                     kmp_int64 incr, kmp_int64 chunk) {
1010   __kmp_dist_for_static_init<kmp_uint64>(loc, gtid, schedule, plastiter, plower,
1011                                          pupper, pupperD, pstride, incr,
1012                                          chunk OMPT_CODEPTR_ARG);
1013 }
1014 /*!
1015 @}
1016 */
1017 
1018 //------------------------------------------------------------------------------
1019 // Auxiliary routines for Distribute Parallel Loop construct implementation
1020 //    Transfer call to template< type T >
1021 //    __kmp_team_static_init( ident_t *loc, int gtid,
1022 //        int *p_last, T *lb, T *ub, ST *st, ST incr, ST chunk )
1023 
1024 /*!
1025 @ingroup WORK_SHARING
1026 @{
1027 @param loc Source location
1028 @param gtid Global thread id
1029 @param p_last pointer to last iteration flag
1030 @param p_lb  pointer to Lower bound
1031 @param p_ub  pointer to Upper bound
1032 @param p_st  Step (or increment if you prefer)
1033 @param incr  Loop increment
1034 @param chunk The chunk size to block with
1035 
1036 The functions compute the upper and lower bounds and stride to be used for the
1037 set of iterations to be executed by the current team from the statically
1038 scheduled loop that is described by the initial values of the bounds, stride,
1039 increment and chunk for the distribute construct as part of composite distribute
1040 parallel loop construct. These functions are all identical apart from the types
1041 of the arguments.
1042 */
1043 
1044 void __kmpc_team_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
1045                                kmp_int32 *p_lb, kmp_int32 *p_ub,
1046                                kmp_int32 *p_st, kmp_int32 incr,
1047                                kmp_int32 chunk) {
1048   KMP_DEBUG_ASSERT(__kmp_init_serial);
1049   __kmp_team_static_init<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
1050                                     chunk);
1051 }
1052 
1053 /*!
1054  See @ref __kmpc_team_static_init_4
1055  */
1056 void __kmpc_team_static_init_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
1057                                 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
1058                                 kmp_int32 *p_st, kmp_int32 incr,
1059                                 kmp_int32 chunk) {
1060   KMP_DEBUG_ASSERT(__kmp_init_serial);
1061   __kmp_team_static_init<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
1062                                      chunk);
1063 }
1064 
1065 /*!
1066  See @ref __kmpc_team_static_init_4
1067  */
1068 void __kmpc_team_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
1069                                kmp_int64 *p_lb, kmp_int64 *p_ub,
1070                                kmp_int64 *p_st, kmp_int64 incr,
1071                                kmp_int64 chunk) {
1072   KMP_DEBUG_ASSERT(__kmp_init_serial);
1073   __kmp_team_static_init<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
1074                                     chunk);
1075 }
1076 
1077 /*!
1078  See @ref __kmpc_team_static_init_4
1079  */
1080 void __kmpc_team_static_init_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
1081                                 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
1082                                 kmp_int64 *p_st, kmp_int64 incr,
1083                                 kmp_int64 chunk) {
1084   KMP_DEBUG_ASSERT(__kmp_init_serial);
1085   __kmp_team_static_init<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
1086                                      chunk);
1087 }
1088 /*!
1089 @}
1090 */
1091 
1092 } // extern "C"
1093