xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.cpp (revision 4b50c451720d8b427757a6da1dd2bb4c52cd9e35)
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  *       it may change values between parallel regions.  __kmp_max_nth
17  *       is the largest value __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   int monotonicity;
76   // default to monotonic
77   monotonicity = SCHEDULE_MONOTONIC;
78   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79     monotonicity = SCHEDULE_NONMONOTONIC;
80   else if (SCHEDULE_HAS_MONOTONIC(schedule))
81     monotonicity = SCHEDULE_MONOTONIC;
82   return monotonicity;
83 }
84 
85 // Initialize a dispatch_private_info_template<T> buffer for a particular
86 // type of schedule,chunk.  The loop description is found in lb (lower bound),
87 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
88 // to the scheduling (often the number of threads in a team, but not always if
89 // hierarchical scheduling is used).  tid is the id of the thread calling
90 // the function within the group of nproc threads.  It will have a value
91 // between 0 and nproc - 1.  This is often just the thread id within a team, but
92 // is not necessarily the case when using hierarchical scheduling.
93 // loc is the source file location of the corresponding loop
94 // gtid is the global thread id
95 template <typename T>
96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97                                    dispatch_private_info_template<T> *pr,
98                                    enum sched_type schedule, T lb, T ub,
99                                    typename traits_t<T>::signed_t st,
100 #if USE_ITT_BUILD
101                                    kmp_uint64 *cur_chunk,
102 #endif
103                                    typename traits_t<T>::signed_t chunk,
104                                    T nproc, T tid) {
105   typedef typename traits_t<T>::unsigned_t UT;
106   typedef typename traits_t<T>::floating_t DBL;
107 
108   int active;
109   T tc;
110   kmp_info_t *th;
111   kmp_team_t *team;
112   int monotonicity;
113   bool use_hier;
114 
115 #ifdef KMP_DEBUG
116   typedef typename traits_t<T>::signed_t ST;
117   {
118     char *buff;
119     // create format specifiers before the debug output
120     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123                             traits_t<T>::spec, traits_t<T>::spec,
124                             traits_t<ST>::spec, traits_t<ST>::spec,
125                             traits_t<T>::spec, traits_t<T>::spec);
126     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127     __kmp_str_free(&buff);
128   }
129 #endif
130   /* setup data */
131   th = __kmp_threads[gtid];
132   team = th->th.th_team;
133   active = !team->t.t_serialized;
134 
135 #if USE_ITT_BUILD
136   int itt_need_metadata_reporting =
137       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139       team->t.t_active_level == 1;
140 #endif
141 
142 #if KMP_USE_HIER_SCHED
143   use_hier = pr->flags.use_hier;
144 #else
145   use_hier = false;
146 #endif
147 
148   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149   monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
151 
152   /* Pick up the nomerge/ordered bits from the scheduling type */
153   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154     pr->flags.nomerge = TRUE;
155     schedule =
156         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
157   } else {
158     pr->flags.nomerge = FALSE;
159   }
160   pr->type_size = traits_t<T>::type_size; // remember the size of variables
161   if (kmp_ord_lower & schedule) {
162     pr->flags.ordered = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
165   } else {
166     pr->flags.ordered = FALSE;
167   }
168   // Ordered overrides nonmonotonic
169   if (pr->flags.ordered) {
170     monotonicity = SCHEDULE_MONOTONIC;
171   }
172 
173   if (schedule == kmp_sch_static) {
174     schedule = __kmp_static;
175   } else {
176     if (schedule == kmp_sch_runtime) {
177       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
178       // not specified)
179       schedule = team->t.t_sched.r_sched_type;
180       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
182       // Detail the schedule if needed (global controls are differentiated
183       // appropriately)
184       if (schedule == kmp_sch_guided_chunked) {
185         schedule = __kmp_guided;
186       } else if (schedule == kmp_sch_static) {
187         schedule = __kmp_static;
188       }
189       // Use the chunk size specified by OMP_SCHEDULE (or default if not
190       // specified)
191       chunk = team->t.t_sched.chunk;
192 #if USE_ITT_BUILD
193       if (cur_chunk)
194         *cur_chunk = chunk;
195 #endif
196 #ifdef KMP_DEBUG
197       {
198         char *buff;
199         // create format specifiers before the debug output
200         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201                                 "schedule:%%d chunk:%%%s\n",
202                                 traits_t<ST>::spec);
203         KD_TRACE(10, (buff, gtid, schedule, chunk));
204         __kmp_str_free(&buff);
205       }
206 #endif
207     } else {
208       if (schedule == kmp_sch_guided_chunked) {
209         schedule = __kmp_guided;
210       }
211       if (chunk <= 0) {
212         chunk = KMP_DEFAULT_CHUNK;
213       }
214     }
215 
216     if (schedule == kmp_sch_auto) {
217       // mapping and differentiation: in the __kmp_do_serial_initialize()
218       schedule = __kmp_auto;
219 #ifdef KMP_DEBUG
220       {
221         char *buff;
222         // create format specifiers before the debug output
223         buff = __kmp_str_format(
224             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225             "schedule:%%d chunk:%%%s\n",
226             traits_t<ST>::spec);
227         KD_TRACE(10, (buff, gtid, schedule, chunk));
228         __kmp_str_free(&buff);
229       }
230 #endif
231     }
232 #if KMP_STATIC_STEAL_ENABLED
233     // map nonmonotonic:dynamic to static steal
234     if (schedule == kmp_sch_dynamic_chunked) {
235       if (monotonicity == SCHEDULE_NONMONOTONIC)
236         schedule = kmp_sch_static_steal;
237     }
238 #endif
239     /* guided analytical not safe for too many threads */
240     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241       schedule = kmp_sch_guided_iterative_chunked;
242       KMP_WARNING(DispatchManyThreads);
243     }
244     if (schedule == kmp_sch_runtime_simd) {
245       // compiler provides simd_width in the chunk parameter
246       schedule = team->t.t_sched.r_sched_type;
247       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
249       // Detail the schedule if needed (global controls are differentiated
250       // appropriately)
251       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252           schedule == __kmp_static) {
253         schedule = kmp_sch_static_balanced_chunked;
254       } else {
255         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256           schedule = kmp_sch_guided_simd;
257         }
258         chunk = team->t.t_sched.chunk * chunk;
259       }
260 #if USE_ITT_BUILD
261       if (cur_chunk)
262         *cur_chunk = chunk;
263 #endif
264 #ifdef KMP_DEBUG
265       {
266         char *buff;
267         // create format specifiers before the debug output
268         buff = __kmp_str_format(
269             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
270             " chunk:%%%s\n",
271             traits_t<ST>::spec);
272         KD_TRACE(10, (buff, gtid, schedule, chunk));
273         __kmp_str_free(&buff);
274       }
275 #endif
276     }
277     pr->u.p.parm1 = chunk;
278   }
279   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280               "unknown scheduling type");
281 
282   pr->u.p.count = 0;
283 
284   if (__kmp_env_consistency_check) {
285     if (st == 0) {
286       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
288     }
289   }
290   // compute trip count
291   if (st == 1) { // most common case
292     if (ub >= lb) {
293       tc = ub - lb + 1;
294     } else { // ub < lb
295       tc = 0; // zero-trip
296     }
297   } else if (st < 0) {
298     if (lb >= ub) {
299       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300       // where the division needs to be unsigned regardless of the result type
301       tc = (UT)(lb - ub) / (-st) + 1;
302     } else { // lb < ub
303       tc = 0; // zero-trip
304     }
305   } else { // st > 0
306     if (ub >= lb) {
307       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(ub - lb) / st + 1;
310     } else { // ub < lb
311       tc = 0; // zero-trip
312     }
313   }
314 
315 #if KMP_STATS_ENABLED
316   if (KMP_MASTER_GTID(gtid)) {
317     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
318   }
319 #endif
320 
321   pr->u.p.lb = lb;
322   pr->u.p.ub = ub;
323   pr->u.p.st = st;
324   pr->u.p.tc = tc;
325 
326 #if KMP_OS_WINDOWS
327   pr->u.p.last_upper = ub + st;
328 #endif /* KMP_OS_WINDOWS */
329 
330   /* NOTE: only the active parallel region(s) has active ordered sections */
331 
332   if (active) {
333     if (pr->flags.ordered) {
334       pr->ordered_bumped = 0;
335       pr->u.p.ordered_lower = 1;
336       pr->u.p.ordered_upper = 0;
337     }
338   }
339 
340   switch (schedule) {
341 #if (KMP_STATIC_STEAL_ENABLED)
342   case kmp_sch_static_steal: {
343     T ntc, init;
344 
345     KD_TRACE(100,
346              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
347               gtid));
348 
349     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350     if (nproc > 1 && ntc >= nproc) {
351       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
352       T id = tid;
353       T small_chunk, extras;
354 
355       small_chunk = ntc / nproc;
356       extras = ntc % nproc;
357 
358       init = id * small_chunk + (id < extras ? id : extras);
359       pr->u.p.count = init;
360       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
361 
362       pr->u.p.parm2 = lb;
363       // parm3 is the number of times to attempt stealing which is
364       // proportional to the number of chunks per thread up until
365       // the maximum value of nproc.
366       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
367       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
368       pr->u.p.st = st;
369       if (traits_t<T>::type_size > 4) {
370         // AC: TODO: check if 16-byte CAS available and use it to
371         // improve performance (probably wait for explicit request
372         // before spending time on this).
373         // For now use dynamically allocated per-thread lock,
374         // free memory in __kmp_dispatch_next when status==0.
375         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
376         th->th.th_dispatch->th_steal_lock =
377             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
379       }
380       break;
381     } else {
382       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
383                      "kmp_sch_static_balanced\n",
384                      gtid));
385       schedule = kmp_sch_static_balanced;
386       /* too few iterations: fall-through to kmp_sch_static_balanced */
387     } // if
388     /* FALL-THROUGH to static balanced */
389     KMP_FALLTHROUGH();
390   } // case
391 #endif
392   case kmp_sch_static_balanced: {
393     T init, limit;
394 
395     KD_TRACE(
396         100,
397         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
398          gtid));
399 
400     if (nproc > 1) {
401       T id = tid;
402 
403       if (tc < nproc) {
404         if (id < tc) {
405           init = id;
406           limit = id;
407           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
408         } else {
409           pr->u.p.count = 1; /* means no more chunks to execute */
410           pr->u.p.parm1 = FALSE;
411           break;
412         }
413       } else {
414         T small_chunk = tc / nproc;
415         T extras = tc % nproc;
416         init = id * small_chunk + (id < extras ? id : extras);
417         limit = init + small_chunk - (id < extras ? 0 : 1);
418         pr->u.p.parm1 = (id == nproc - 1);
419       }
420     } else {
421       if (tc > 0) {
422         init = 0;
423         limit = tc - 1;
424         pr->u.p.parm1 = TRUE;
425       } else {
426         // zero trip count
427         pr->u.p.count = 1; /* means no more chunks to execute */
428         pr->u.p.parm1 = FALSE;
429         break;
430       }
431     }
432 #if USE_ITT_BUILD
433     // Calculate chunk for metadata report
434     if (itt_need_metadata_reporting)
435       if (cur_chunk)
436         *cur_chunk = limit - init + 1;
437 #endif
438     if (st == 1) {
439       pr->u.p.lb = lb + init;
440       pr->u.p.ub = lb + limit;
441     } else {
442       // calculated upper bound, "ub" is user-defined upper bound
443       T ub_tmp = lb + limit * st;
444       pr->u.p.lb = lb + init * st;
445       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
446       // it exactly
447       if (st > 0) {
448         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
449       } else {
450         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
451       }
452     }
453     if (pr->flags.ordered) {
454       pr->u.p.ordered_lower = init;
455       pr->u.p.ordered_upper = limit;
456     }
457     break;
458   } // case
459   case kmp_sch_static_balanced_chunked: {
460     // similar to balanced, but chunk adjusted to multiple of simd width
461     T nth = nproc;
462     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
463                    " -> falling-through to static_greedy\n",
464                    gtid));
465     schedule = kmp_sch_static_greedy;
466     if (nth > 1)
467       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
468     else
469       pr->u.p.parm1 = tc;
470     break;
471   } // case
472   case kmp_sch_guided_simd:
473   case kmp_sch_guided_iterative_chunked: {
474     KD_TRACE(
475         100,
476         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
477          " case\n",
478          gtid));
479 
480     if (nproc > 1) {
481       if ((2L * chunk + 1) * nproc >= tc) {
482         /* chunk size too large, switch to dynamic */
483         schedule = kmp_sch_dynamic_chunked;
484       } else {
485         // when remaining iters become less than parm2 - switch to dynamic
486         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
487         *(double *)&pr->u.p.parm3 =
488             guided_flt_param / nproc; // may occupy parm3 and parm4
489       }
490     } else {
491       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
492                      "kmp_sch_static_greedy\n",
493                      gtid));
494       schedule = kmp_sch_static_greedy;
495       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
496       KD_TRACE(
497           100,
498           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
499            gtid));
500       pr->u.p.parm1 = tc;
501     } // if
502   } // case
503   break;
504   case kmp_sch_guided_analytical_chunked: {
505     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
506                    "kmp_sch_guided_analytical_chunked case\n",
507                    gtid));
508 
509     if (nproc > 1) {
510       if ((2L * chunk + 1) * nproc >= tc) {
511         /* chunk size too large, switch to dynamic */
512         schedule = kmp_sch_dynamic_chunked;
513       } else {
514         /* commonly used term: (2 nproc - 1)/(2 nproc) */
515         DBL x;
516 
517 #if KMP_USE_X87CONTROL
518         /* Linux* OS already has 64-bit computation by default for long double,
519            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
520            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
521            instead of the default 53-bit. Even though long double doesn't work
522            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
523            expected to impact the correctness of the algorithm, but this has not
524            been mathematically proven. */
525         // save original FPCW and set precision to 64-bit, as
526         // Windows* OS on IA-32 architecture defaults to 53-bit
527         unsigned int oldFpcw = _control87(0, 0);
528         _control87(_PC_64, _MCW_PC); // 0,0x30000
529 #endif
530         /* value used for comparison in solver for cross-over point */
531         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
532 
533         /* crossover point--chunk indexes equal to or greater than
534            this point switch to dynamic-style scheduling */
535         UT cross;
536 
537         /* commonly used term: (2 nproc - 1)/(2 nproc) */
538         x = (long double)1.0 - (long double)0.5 / nproc;
539 
540 #ifdef KMP_DEBUG
541         { // test natural alignment
542           struct _test_a {
543             char a;
544             union {
545               char b;
546               DBL d;
547             };
548           } t;
549           ptrdiff_t natural_alignment =
550               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
551           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
552           // long)natural_alignment );
553           KMP_DEBUG_ASSERT(
554               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
555         }
556 #endif // KMP_DEBUG
557 
558         /* save the term in thread private dispatch structure */
559         *(DBL *)&pr->u.p.parm3 = x;
560 
561         /* solve for the crossover point to the nearest integer i for which C_i
562            <= chunk */
563         {
564           UT left, right, mid;
565           long double p;
566 
567           /* estimate initial upper and lower bound */
568 
569           /* doesn't matter what value right is as long as it is positive, but
570              it affects performance of the solver */
571           right = 229;
572           p = __kmp_pow<UT>(x, right);
573           if (p > target) {
574             do {
575               p *= p;
576               right <<= 1;
577             } while (p > target && right < (1 << 27));
578             /* lower bound is previous (failed) estimate of upper bound */
579             left = right >> 1;
580           } else {
581             left = 0;
582           }
583 
584           /* bisection root-finding method */
585           while (left + 1 < right) {
586             mid = (left + right) / 2;
587             if (__kmp_pow<UT>(x, mid) > target) {
588               left = mid;
589             } else {
590               right = mid;
591             }
592           } // while
593           cross = right;
594         }
595         /* assert sanity of computed crossover point */
596         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
597                    __kmp_pow<UT>(x, cross) <= target);
598 
599         /* save the crossover point in thread private dispatch structure */
600         pr->u.p.parm2 = cross;
601 
602 // C75803
603 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
604 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
605 #else
606 #define GUIDED_ANALYTICAL_WORKAROUND (x)
607 #endif
608         /* dynamic-style scheduling offset */
609         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
610                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
611                         cross * chunk;
612 #if KMP_USE_X87CONTROL
613         // restore FPCW
614         _control87(oldFpcw, _MCW_PC);
615 #endif
616       } // if
617     } else {
618       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
619                      "kmp_sch_static_greedy\n",
620                      gtid));
621       schedule = kmp_sch_static_greedy;
622       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
623       pr->u.p.parm1 = tc;
624     } // if
625   } // case
626   break;
627   case kmp_sch_static_greedy:
628     KD_TRACE(
629         100,
630         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
631          gtid));
632     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
633     break;
634   case kmp_sch_static_chunked:
635   case kmp_sch_dynamic_chunked:
636     if (pr->u.p.parm1 <= 0) {
637       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
638     }
639     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
640                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
641                    gtid));
642     break;
643   case kmp_sch_trapezoidal: {
644     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
645 
646     T parm1, parm2, parm3, parm4;
647     KD_TRACE(100,
648              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
649               gtid));
650 
651     parm1 = chunk;
652 
653     /* F : size of the first cycle */
654     parm2 = (tc / (2 * nproc));
655 
656     if (parm2 < 1) {
657       parm2 = 1;
658     }
659 
660     /* L : size of the last cycle.  Make sure the last cycle is not larger
661        than the first cycle. */
662     if (parm1 < 1) {
663       parm1 = 1;
664     } else if (parm1 > parm2) {
665       parm1 = parm2;
666     }
667 
668     /* N : number of cycles */
669     parm3 = (parm2 + parm1);
670     parm3 = (2 * tc + parm3 - 1) / parm3;
671 
672     if (parm3 < 2) {
673       parm3 = 2;
674     }
675 
676     /* sigma : decreasing incr of the trapezoid */
677     parm4 = (parm3 - 1);
678     parm4 = (parm2 - parm1) / parm4;
679 
680     // pointless check, because parm4 >= 0 always
681     // if ( parm4 < 0 ) {
682     //    parm4 = 0;
683     //}
684 
685     pr->u.p.parm1 = parm1;
686     pr->u.p.parm2 = parm2;
687     pr->u.p.parm3 = parm3;
688     pr->u.p.parm4 = parm4;
689   } // case
690   break;
691 
692   default: {
693     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
694                 KMP_HNT(GetNewerLibrary), // Hint
695                 __kmp_msg_null // Variadic argument list terminator
696                 );
697   } break;
698   } // switch
699   pr->schedule = schedule;
700 }
701 
702 #if KMP_USE_HIER_SCHED
703 template <typename T>
704 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
705                                              typename traits_t<T>::signed_t st);
706 template <>
707 inline void
708 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
709                                             kmp_int32 ub, kmp_int32 st) {
710   __kmp_dispatch_init_hierarchy<kmp_int32>(
711       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
712       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
713 }
714 template <>
715 inline void
716 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
717                                              kmp_uint32 ub, kmp_int32 st) {
718   __kmp_dispatch_init_hierarchy<kmp_uint32>(
719       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
720       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
721 }
722 template <>
723 inline void
724 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
725                                             kmp_int64 ub, kmp_int64 st) {
726   __kmp_dispatch_init_hierarchy<kmp_int64>(
727       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
728       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
729 }
730 template <>
731 inline void
732 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
733                                              kmp_uint64 ub, kmp_int64 st) {
734   __kmp_dispatch_init_hierarchy<kmp_uint64>(
735       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
736       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
737 }
738 
739 // free all the hierarchy scheduling memory associated with the team
740 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
741   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
742   for (int i = 0; i < num_disp_buff; ++i) {
743     // type does not matter here so use kmp_int32
744     auto sh =
745         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
746             &team->t.t_disp_buffer[i]);
747     if (sh->hier) {
748       sh->hier->deallocate();
749       __kmp_free(sh->hier);
750     }
751   }
752 }
753 #endif
754 
755 // UT - unsigned flavor of T, ST - signed flavor of T,
756 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
757 template <typename T>
758 static void
759 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
760                     T ub, typename traits_t<T>::signed_t st,
761                     typename traits_t<T>::signed_t chunk, int push_ws) {
762   typedef typename traits_t<T>::unsigned_t UT;
763 
764   int active;
765   kmp_info_t *th;
766   kmp_team_t *team;
767   kmp_uint32 my_buffer_index;
768   dispatch_private_info_template<T> *pr;
769   dispatch_shared_info_template<T> volatile *sh;
770 
771   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
772                    sizeof(dispatch_private_info));
773   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
774                    sizeof(dispatch_shared_info));
775 
776   if (!TCR_4(__kmp_init_parallel))
777     __kmp_parallel_initialize();
778 
779   __kmp_resume_if_soft_paused();
780 
781 #if INCLUDE_SSC_MARKS
782   SSC_MARK_DISPATCH_INIT();
783 #endif
784 #ifdef KMP_DEBUG
785   typedef typename traits_t<T>::signed_t ST;
786   {
787     char *buff;
788     // create format specifiers before the debug output
789     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
790                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
791                             traits_t<ST>::spec, traits_t<T>::spec,
792                             traits_t<T>::spec, traits_t<ST>::spec);
793     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
794     __kmp_str_free(&buff);
795   }
796 #endif
797   /* setup data */
798   th = __kmp_threads[gtid];
799   team = th->th.th_team;
800   active = !team->t.t_serialized;
801   th->th.th_ident = loc;
802 
803   // Any half-decent optimizer will remove this test when the blocks are empty
804   // since the macros expand to nothing
805   // when statistics are disabled.
806   if (schedule == __kmp_static) {
807     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
808   } else {
809     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
810   }
811 
812 #if KMP_USE_HIER_SCHED
813   // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
814   // Hierarchical scheduling does not work with ordered, so if ordered is
815   // detected, then revert back to threaded scheduling.
816   bool ordered;
817   enum sched_type my_sched = schedule;
818   my_buffer_index = th->th.th_dispatch->th_disp_index;
819   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
820       &th->th.th_dispatch
821            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
822   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
823   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
824     my_sched =
825         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
826   ordered = (kmp_ord_lower & my_sched);
827   if (pr->flags.use_hier) {
828     if (ordered) {
829       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
830                      "Disabling hierarchical scheduling.\n",
831                      gtid));
832       pr->flags.use_hier = FALSE;
833     }
834   }
835   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
836     // Don't use hierarchical for ordered parallel loops and don't
837     // use the runtime hierarchy if one was specified in the program
838     if (!ordered && !pr->flags.use_hier)
839       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
840   }
841 #endif // KMP_USE_HIER_SCHED
842 
843 #if USE_ITT_BUILD
844   kmp_uint64 cur_chunk = chunk;
845   int itt_need_metadata_reporting =
846       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
847       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
848       team->t.t_active_level == 1;
849 #endif
850   if (!active) {
851     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
852         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
853   } else {
854     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
855                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
856 
857     my_buffer_index = th->th.th_dispatch->th_disp_index++;
858 
859     /* What happens when number of threads changes, need to resize buffer? */
860     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
861         &th->th.th_dispatch
862              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
863     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
864         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
865     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
866                   my_buffer_index));
867   }
868 
869   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
870 #if USE_ITT_BUILD
871                                 &cur_chunk,
872 #endif
873                                 chunk, (T)th->th.th_team_nproc,
874                                 (T)th->th.th_info.ds.ds_tid);
875   if (active) {
876     if (pr->flags.ordered == 0) {
877       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
878       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
879     } else {
880       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
881       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
882     }
883   }
884 
885   if (active) {
886     /* The name of this buffer should be my_buffer_index when it's free to use
887      * it */
888 
889     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
890                    "sh->buffer_index:%d\n",
891                    gtid, my_buffer_index, sh->buffer_index));
892     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
893                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
894     // Note: KMP_WAIT() cannot be used there: buffer index and
895     // my_buffer_index are *always* 32-bit integers.
896     KMP_MB(); /* is this necessary? */
897     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
898                    "sh->buffer_index:%d\n",
899                    gtid, my_buffer_index, sh->buffer_index));
900 
901     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
902     th->th.th_dispatch->th_dispatch_sh_current =
903         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
904 #if USE_ITT_BUILD
905     if (pr->flags.ordered) {
906       __kmp_itt_ordered_init(gtid);
907     }
908     // Report loop metadata
909     if (itt_need_metadata_reporting) {
910       // Only report metadata by master of active team at level 1
911       kmp_uint64 schedtype = 0;
912       switch (schedule) {
913       case kmp_sch_static_chunked:
914       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
915         break;
916       case kmp_sch_static_greedy:
917         cur_chunk = pr->u.p.parm1;
918         break;
919       case kmp_sch_dynamic_chunked:
920         schedtype = 1;
921         break;
922       case kmp_sch_guided_iterative_chunked:
923       case kmp_sch_guided_analytical_chunked:
924       case kmp_sch_guided_simd:
925         schedtype = 2;
926         break;
927       default:
928         // Should we put this case under "static"?
929         // case kmp_sch_static_steal:
930         schedtype = 3;
931         break;
932       }
933       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
934     }
935 #if KMP_USE_HIER_SCHED
936     if (pr->flags.use_hier) {
937       pr->u.p.count = 0;
938       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
939     }
#endif // KMP_USE_HIER_SCHED
941 #endif /* USE_ITT_BUILD */
942   }
943 
944 #ifdef KMP_DEBUG
945   {
946     char *buff;
947     // create format specifiers before the debug output
948     buff = __kmp_str_format(
949         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
950         "lb:%%%s ub:%%%s"
951         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
952         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
953         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
954         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
955         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
956         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
957     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
958                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
959                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
960                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
961     __kmp_str_free(&buff);
962   }
963 #endif
964 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that, after execution of a loop with some other
  // schedule kind, all the parm3 variables will contain the same value. Even
  // if all parm3 values were the same, there would still be a bad case, such
  // as using 0 and 1 rather than a program-lifetime increment. So a dedicated
  // variable is required; 'static_steal_counter' is used for this.
970   if (schedule == kmp_sch_static_steal) {
971     // Other threads will inspect this variable when searching for a victim.
972     // This is a flag showing that other threads may steal from this thread
973     // since then.
974     volatile T *p = &pr->u.p.static_steal_counter;
975     *p = *p + 1;
976   }
977 #endif // ( KMP_STATIC_STEAL_ENABLED )
978 
979 #if OMPT_SUPPORT && OMPT_OPTIONAL
980   if (ompt_enabled.ompt_callback_work) {
981     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
982     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
983     ompt_callbacks.ompt_callback(ompt_callback_work)(
984         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
985         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
986   }
987 #endif
988   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
989 }
990 
991 /* For ordered loops, either __kmp_dispatch_finish() should be called after
992  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
993  * every chunk of iterations.  If the ordered section(s) were not executed
994  * for this iteration (or every iteration in this chunk), we need to set the
995  * ordered iteration counters so that the next thread can proceed. */
996 template <typename UT>
997 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
998   typedef typename traits_t<UT>::signed_t ST;
999   kmp_info_t *th = __kmp_threads[gtid];
1000 
1001   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1002   if (!th->th.th_team->t.t_serialized) {
1003 
1004     dispatch_private_info_template<UT> *pr =
1005         reinterpret_cast<dispatch_private_info_template<UT> *>(
1006             th->th.th_dispatch->th_dispatch_pr_current);
1007     dispatch_shared_info_template<UT> volatile *sh =
1008         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1009             th->th.th_dispatch->th_dispatch_sh_current);
1010     KMP_DEBUG_ASSERT(pr);
1011     KMP_DEBUG_ASSERT(sh);
1012     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1013                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1014 
1015     if (pr->ordered_bumped) {
1016       KD_TRACE(
1017           1000,
1018           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1019            gtid));
1020       pr->ordered_bumped = 0;
1021     } else {
1022       UT lower = pr->u.p.ordered_lower;
1023 
1024 #ifdef KMP_DEBUG
1025       {
1026         char *buff;
1027         // create format specifiers before the debug output
1028         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1029                                 "ordered_iteration:%%%s lower:%%%s\n",
1030                                 traits_t<UT>::spec, traits_t<UT>::spec);
1031         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1032         __kmp_str_free(&buff);
1033       }
1034 #endif
1035 
1036       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1037                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1038       KMP_MB(); /* is this necessary? */
1039 #ifdef KMP_DEBUG
1040       {
1041         char *buff;
1042         // create format specifiers before the debug output
1043         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1044                                 "ordered_iteration:%%%s lower:%%%s\n",
1045                                 traits_t<UT>::spec, traits_t<UT>::spec);
1046         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1047         __kmp_str_free(&buff);
1048       }
1049 #endif
1050 
1051       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1052     } // if
1053   } // if
1054   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1055 }
1056 
1057 #ifdef KMP_GOMP_COMPAT
1058 
1059 template <typename UT>
1060 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1061   typedef typename traits_t<UT>::signed_t ST;
1062   kmp_info_t *th = __kmp_threads[gtid];
1063 
1064   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1065   if (!th->th.th_team->t.t_serialized) {
1066     //        int cid;
1067     dispatch_private_info_template<UT> *pr =
1068         reinterpret_cast<dispatch_private_info_template<UT> *>(
1069             th->th.th_dispatch->th_dispatch_pr_current);
1070     dispatch_shared_info_template<UT> volatile *sh =
1071         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1072             th->th.th_dispatch->th_dispatch_sh_current);
1073     KMP_DEBUG_ASSERT(pr);
1074     KMP_DEBUG_ASSERT(sh);
1075     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1076                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1077 
1078     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1079     UT lower = pr->u.p.ordered_lower;
1080     UT upper = pr->u.p.ordered_upper;
1081     UT inc = upper - lower + 1;
1082 
1083     if (pr->ordered_bumped == inc) {
1084       KD_TRACE(
1085           1000,
1086           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1087            gtid));
1088       pr->ordered_bumped = 0;
1089     } else {
1090       inc -= pr->ordered_bumped;
1091 
1092 #ifdef KMP_DEBUG
1093       {
1094         char *buff;
1095         // create format specifiers before the debug output
1096         buff = __kmp_str_format(
1097             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1098             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1099             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1100         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1101         __kmp_str_free(&buff);
1102       }
1103 #endif
1104 
1105       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1106                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1107 
1108       KMP_MB(); /* is this necessary? */
1109       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1110                       "ordered_bumped to zero\n",
1111                       gtid));
1112       pr->ordered_bumped = 0;
1113 //!!!!! TODO check if the inc should be unsigned, or signed???
1114 #ifdef KMP_DEBUG
1115       {
1116         char *buff;
1117         // create format specifiers before the debug output
1118         buff = __kmp_str_format(
1119             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1120             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1121             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1122             traits_t<UT>::spec);
1123         KD_TRACE(1000,
1124                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1125         __kmp_str_free(&buff);
1126       }
1127 #endif
1128 
1129       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1130     }
1131     //        }
1132   }
1133   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1134 }
1135 
1136 #endif /* KMP_GOMP_COMPAT */
1137 
1138 template <typename T>
1139 int __kmp_dispatch_next_algorithm(int gtid,
1140                                   dispatch_private_info_template<T> *pr,
1141                                   dispatch_shared_info_template<T> volatile *sh,
1142                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1143                                   typename traits_t<T>::signed_t *p_st, T nproc,
1144                                   T tid) {
1145   typedef typename traits_t<T>::unsigned_t UT;
1146   typedef typename traits_t<T>::signed_t ST;
1147   typedef typename traits_t<T>::floating_t DBL;
1148   int status = 0;
1149   kmp_int32 last = 0;
1150   T start;
1151   ST incr;
1152   UT limit, trip, init;
1153   kmp_info_t *th = __kmp_threads[gtid];
1154   kmp_team_t *team = th->th.th_team;
1155 
1156   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1157                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1158   KMP_DEBUG_ASSERT(pr);
1159   KMP_DEBUG_ASSERT(sh);
1160   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1161 #ifdef KMP_DEBUG
1162   {
1163     char *buff;
1164     // create format specifiers before the debug output
1165     buff =
1166         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1167                          "sh:%%p nproc:%%%s tid:%%%s\n",
1168                          traits_t<T>::spec, traits_t<T>::spec);
1169     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1170     __kmp_str_free(&buff);
1171   }
1172 #endif
1173 
1174   // zero trip count
1175   if (pr->u.p.tc == 0) {
1176     KD_TRACE(10,
1177              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1178               "zero status:%d\n",
1179               gtid, status));
1180     return 0;
1181   }
1182 
1183   switch (pr->schedule) {
1184 #if (KMP_STATIC_STEAL_ENABLED)
1185   case kmp_sch_static_steal: {
1186     T chunk = pr->u.p.parm1;
1187 
1188     KD_TRACE(100,
1189              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1190               gtid));
1191 
1192     trip = pr->u.p.tc - 1;
1193 
1194     if (traits_t<T>::type_size > 4) {
1195       // use lock for 8-byte and CAS for 4-byte induction
1196       // variable. TODO (optional): check and use 16-byte CAS
1197       kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1198       KMP_DEBUG_ASSERT(lck != NULL);
1199       if (pr->u.p.count < (UT)pr->u.p.ub) {
1200         __kmp_acquire_lock(lck, gtid);
1201         // try to get own chunk of iterations
1202         init = (pr->u.p.count)++;
1203         status = (init < (UT)pr->u.p.ub);
1204         __kmp_release_lock(lck, gtid);
1205       } else {
1206         status = 0; // no own chunks
1207       }
1208       if (!status) { // try to steal
1209         kmp_info_t **other_threads = team->t.t_threads;
1210         int while_limit = pr->u.p.parm3;
1211         int while_index = 0;
1212         // TODO: algorithm of searching for a victim
1213         // should be cleaned up and measured
1214         while ((!status) && (while_limit != ++while_index)) {
1215           T remaining;
1216           T victimIdx = pr->u.p.parm4;
1217           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1218           dispatch_private_info_template<T> *victim =
1219               reinterpret_cast<dispatch_private_info_template<T> *>(
1220                   other_threads[victimIdx]
1221                       ->th.th_dispatch->th_dispatch_pr_current);
1222           while ((victim == NULL || victim == pr ||
1223                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1224                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1225                  oldVictimIdx != victimIdx) {
1226             victimIdx = (victimIdx + 1) % nproc;
1227             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1228                 other_threads[victimIdx]
1229                     ->th.th_dispatch->th_dispatch_pr_current);
1230           }
1231           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1232                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1233             continue; // try once more (nproc attempts in total)
1234             // no victim is ready yet to participate in stealing
1235             // because all victims are still in kmp_init_dispatch
1236           }
1237           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1238             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1239             continue; // not enough chunks to steal, goto next victim
1240           }
1241 
1242           lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1243           KMP_ASSERT(lck != NULL);
1244           __kmp_acquire_lock(lck, gtid);
1245           limit = victim->u.p.ub; // keep initial ub
1246           if (victim->u.p.count >= limit ||
1247               (remaining = limit - victim->u.p.count) < 2) {
1248             __kmp_release_lock(lck, gtid);
1249             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1250             continue; // not enough chunks to steal
1251           }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
1254           if (remaining > 3) {
1255             // steal 1/4 of remaining
1256             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1257             init = (victim->u.p.ub -= (remaining >> 2));
1258           } else {
1259             // steal 1 chunk of 2 or 3 remaining
1260             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1261             init = (victim->u.p.ub -= 1);
1262           }
1263           __kmp_release_lock(lck, gtid);
1264 
1265           KMP_DEBUG_ASSERT(init + 1 <= limit);
1266           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1267           status = 1;
1268           while_index = 0;
1269           // now update own count and ub with stolen range but init chunk
1270           __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1271           pr->u.p.count = init + 1;
1272           pr->u.p.ub = limit;
1273           __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1274         } // while (search for victim)
1275       } // if (try to find victim and steal)
1276     } else {
1277       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1278       typedef union {
1279         struct {
1280           UT count;
1281           T ub;
1282         } p;
1283         kmp_int64 b;
1284       } union_i4;
1285       // All operations on 'count' or 'ub' must be combined atomically
1286       // together.
1287       {
1288         union_i4 vold, vnew;
1289         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1290         vnew = vold;
1291         vnew.p.count++;
1292         while (!KMP_COMPARE_AND_STORE_ACQ64(
1293             (volatile kmp_int64 *)&pr->u.p.count,
1294             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1295             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1296           KMP_CPU_PAUSE();
1297           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1298           vnew = vold;
1299           vnew.p.count++;
1300         }
1301         vnew = vold;
1302         init = vnew.p.count;
1303         status = (init < (UT)vnew.p.ub);
1304       }
1305 
1306       if (!status) {
1307         kmp_info_t **other_threads = team->t.t_threads;
1308         int while_limit = pr->u.p.parm3;
1309         int while_index = 0;
1310 
1311         // TODO: algorithm of searching for a victim
1312         // should be cleaned up and measured
1313         while ((!status) && (while_limit != ++while_index)) {
1314           union_i4 vold, vnew;
1315           kmp_int32 remaining;
1316           T victimIdx = pr->u.p.parm4;
1317           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1318           dispatch_private_info_template<T> *victim =
1319               reinterpret_cast<dispatch_private_info_template<T> *>(
1320                   other_threads[victimIdx]
1321                       ->th.th_dispatch->th_dispatch_pr_current);
1322           while ((victim == NULL || victim == pr ||
1323                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1324                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1325                  oldVictimIdx != victimIdx) {
1326             victimIdx = (victimIdx + 1) % nproc;
1327             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1328                 other_threads[victimIdx]
1329                     ->th.th_dispatch->th_dispatch_pr_current);
1330           }
1331           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1332                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1333             continue; // try once more (nproc attempts in total)
1334             // no victim is ready yet to participate in stealing
1335             // because all victims are still in kmp_init_dispatch
1336           }
1337           pr->u.p.parm4 = victimIdx; // new victim found
1338           while (1) { // CAS loop if victim has enough chunks to steal
1339             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1340             vnew = vold;
1341 
1342             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1343             if (vnew.p.count >= (UT)vnew.p.ub ||
1344                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1345               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1346               break; // not enough chunks to steal, goto next victim
1347             }
1348             if (remaining > 3) {
1349               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1350             } else {
1351               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1352             }
1353             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1354             // TODO: Should this be acquire or release?
1355             if (KMP_COMPARE_AND_STORE_ACQ64(
1356                     (volatile kmp_int64 *)&victim->u.p.count,
1357                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1358                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
1360               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1361                                         vold.p.ub - vnew.p.ub);
1362               status = 1;
1363               while_index = 0;
1364               // now update own count and ub
1365               init = vnew.p.ub;
1366               vold.p.count = init + 1;
1367 #if KMP_ARCH_X86
1368               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1369 #else
1370               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1371 #endif
1372               break;
1373             } // if (check CAS result)
1374             KMP_CPU_PAUSE(); // CAS failed, repeate attempt
1375           } // while (try to steal from particular victim)
1376         } // while (search for victim)
1377       } // if (try to find victim and steal)
1378     } // if (4-byte induction variable)
1379     if (!status) {
1380       *p_lb = 0;
1381       *p_ub = 0;
1382       if (p_st != NULL)
1383         *p_st = 0;
1384     } else {
1385       start = pr->u.p.parm2;
1386       init *= chunk;
1387       limit = chunk + init - 1;
1388       incr = pr->u.p.st;
1389       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1390 
1391       KMP_DEBUG_ASSERT(init <= trip);
1392       if ((last = (limit >= trip)) != 0)
1393         limit = trip;
1394       if (p_st != NULL)
1395         *p_st = incr;
1396 
1397       if (incr == 1) {
1398         *p_lb = start + init;
1399         *p_ub = start + limit;
1400       } else {
1401         *p_lb = start + init * incr;
1402         *p_ub = start + limit * incr;
1403       }
1404 
1405       if (pr->flags.ordered) {
1406         pr->u.p.ordered_lower = init;
1407         pr->u.p.ordered_upper = limit;
1408       } // if
1409     } // if
1410     break;
1411   } // case
1412 #endif // ( KMP_STATIC_STEAL_ENABLED )
1413   case kmp_sch_static_balanced: {
1414     KD_TRACE(
1415         10,
1416         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1417          gtid));
1418     /* check if thread has any iteration to do */
1419     if ((status = !pr->u.p.count) != 0) {
1420       pr->u.p.count = 1;
1421       *p_lb = pr->u.p.lb;
1422       *p_ub = pr->u.p.ub;
1423       last = pr->u.p.parm1;
1424       if (p_st != NULL)
1425         *p_st = pr->u.p.st;
1426     } else { /* no iterations to do */
1427       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1428     }
1429   } // case
1430   break;
1431   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1432                                  merged here */
1433   case kmp_sch_static_chunked: {
1434     T parm1;
1435 
1436     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1437                    "kmp_sch_static_[affinity|chunked] case\n",
1438                    gtid));
1439     parm1 = pr->u.p.parm1;
1440 
1441     trip = pr->u.p.tc - 1;
1442     init = parm1 * (pr->u.p.count + tid);
1443 
1444     if ((status = (init <= trip)) != 0) {
1445       start = pr->u.p.lb;
1446       incr = pr->u.p.st;
1447       limit = parm1 + init - 1;
1448 
1449       if ((last = (limit >= trip)) != 0)
1450         limit = trip;
1451 
1452       if (p_st != NULL)
1453         *p_st = incr;
1454 
1455       pr->u.p.count += nproc;
1456 
1457       if (incr == 1) {
1458         *p_lb = start + init;
1459         *p_ub = start + limit;
1460       } else {
1461         *p_lb = start + init * incr;
1462         *p_ub = start + limit * incr;
1463       }
1464 
1465       if (pr->flags.ordered) {
1466         pr->u.p.ordered_lower = init;
1467         pr->u.p.ordered_upper = limit;
1468       } // if
1469     } // if
1470   } // case
1471   break;
1472 
1473   case kmp_sch_dynamic_chunked: {
1474     T chunk = pr->u.p.parm1;
1475 
1476     KD_TRACE(
1477         100,
1478         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1479          gtid));
1480 
1481     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1482     trip = pr->u.p.tc - 1;
1483 
1484     if ((status = (init <= trip)) == 0) {
1485       *p_lb = 0;
1486       *p_ub = 0;
1487       if (p_st != NULL)
1488         *p_st = 0;
1489     } else {
1490       start = pr->u.p.lb;
1491       limit = chunk + init - 1;
1492       incr = pr->u.p.st;
1493 
1494       if ((last = (limit >= trip)) != 0)
1495         limit = trip;
1496 
1497       if (p_st != NULL)
1498         *p_st = incr;
1499 
1500       if (incr == 1) {
1501         *p_lb = start + init;
1502         *p_ub = start + limit;
1503       } else {
1504         *p_lb = start + init * incr;
1505         *p_ub = start + limit * incr;
1506       }
1507 
1508       if (pr->flags.ordered) {
1509         pr->u.p.ordered_lower = init;
1510         pr->u.p.ordered_upper = limit;
1511       } // if
1512     } // if
1513   } // case
1514   break;
1515 
1516   case kmp_sch_guided_iterative_chunked: {
1517     T chunkspec = pr->u.p.parm1;
1518     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1519                    "iterative case\n",
1520                    gtid));
1521     trip = pr->u.p.tc;
1522     // Start atomic part of calculations
1523     while (1) {
1524       ST remaining; // signed, because can be < 0
1525       init = sh->u.s.iteration; // shared value
1526       remaining = trip - init;
1527       if (remaining <= 0) { // AC: need to compare with 0 first
1528         // nothing to do, don't try atomic op
1529         status = 0;
1530         break;
1531       }
1532       if ((T)remaining <
1533           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1536         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1537                                  (ST)chunkspec);
1538         remaining = trip - init;
1539         if (remaining <= 0) {
1540           status = 0; // all iterations got by other threads
1541         } else {
1542           // got some iterations to work on
1543           status = 1;
1544           if ((T)remaining > chunkspec) {
1545             limit = init + chunkspec - 1;
1546           } else {
1547             last = 1; // the last chunk
1548             limit = init + remaining - 1;
1549           } // if
1550         } // if
1551         break;
1552       } // if
1553       limit = init +
1554               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1555       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1556                                (ST)init, (ST)limit)) {
1557         // CAS was successful, chunk obtained
1558         status = 1;
1559         --limit;
1560         break;
1561       } // if
1562     } // while
1563     if (status != 0) {
1564       start = pr->u.p.lb;
1565       incr = pr->u.p.st;
1566       if (p_st != NULL)
1567         *p_st = incr;
1568       *p_lb = start + init * incr;
1569       *p_ub = start + limit * incr;
1570       if (pr->flags.ordered) {
1571         pr->u.p.ordered_lower = init;
1572         pr->u.p.ordered_upper = limit;
1573       } // if
1574     } else {
1575       *p_lb = 0;
1576       *p_ub = 0;
1577       if (p_st != NULL)
1578         *p_st = 0;
1579     } // if
1580   } // case
1581   break;
1582 
1583   case kmp_sch_guided_simd: {
1584     // same as iterative but curr-chunk adjusted to be multiple of given
1585     // chunk
1586     T chunk = pr->u.p.parm1;
1587     KD_TRACE(100,
1588              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1589               gtid));
1590     trip = pr->u.p.tc;
1591     // Start atomic part of calculations
1592     while (1) {
1593       ST remaining; // signed, because can be < 0
1594       init = sh->u.s.iteration; // shared value
1595       remaining = trip - init;
1596       if (remaining <= 0) { // AC: need to compare with 0 first
1597         status = 0; // nothing to do, don't try atomic op
1598         break;
1599       }
1600       KMP_DEBUG_ASSERT(init % chunk == 0);
1601       // compare with K*nproc*(chunk+1), K=2 by default
1602       if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1605         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1606                                  (ST)chunk);
1607         remaining = trip - init;
1608         if (remaining <= 0) {
1609           status = 0; // all iterations got by other threads
1610         } else {
1611           // got some iterations to work on
1612           status = 1;
1613           if ((T)remaining > chunk) {
1614             limit = init + chunk - 1;
1615           } else {
1616             last = 1; // the last chunk
1617             limit = init + remaining - 1;
1618           } // if
1619         } // if
1620         break;
1621       } // if
1622       // divide by K*nproc
1623       UT span = remaining * (*(double *)&pr->u.p.parm3);
1624       UT rem = span % chunk;
1625       if (rem) // adjust so that span%chunk == 0
1626         span += chunk - rem;
1627       limit = init + span;
1628       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1629                                (ST)init, (ST)limit)) {
1630         // CAS was successful, chunk obtained
1631         status = 1;
1632         --limit;
1633         break;
1634       } // if
1635     } // while
1636     if (status != 0) {
1637       start = pr->u.p.lb;
1638       incr = pr->u.p.st;
1639       if (p_st != NULL)
1640         *p_st = incr;
1641       *p_lb = start + init * incr;
1642       *p_ub = start + limit * incr;
1643       if (pr->flags.ordered) {
1644         pr->u.p.ordered_lower = init;
1645         pr->u.p.ordered_upper = limit;
1646       } // if
1647     } else {
1648       *p_lb = 0;
1649       *p_ub = 0;
1650       if (p_st != NULL)
1651         *p_st = 0;
1652     } // if
1653   } // case
1654   break;
1655 
1656   case kmp_sch_guided_analytical_chunked: {
1657     T chunkspec = pr->u.p.parm1;
1658     UT chunkIdx;
1659 #if KMP_USE_X87CONTROL
1660     /* for storing original FPCW value for Windows* OS on
1661        IA-32 architecture 8-byte version */
1662     unsigned int oldFpcw;
1663     unsigned int fpcwSet = 0;
1664 #endif
1665     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1666                    "kmp_sch_guided_analytical_chunked case\n",
1667                    gtid));
1668 
1669     trip = pr->u.p.tc;
1670 
1671     KMP_DEBUG_ASSERT(nproc > 1);
1672     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1673 
1674     while (1) { /* this while loop is a safeguard against unexpected zero
1675                    chunk sizes */
1676       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1677       if (chunkIdx >= (UT)pr->u.p.parm2) {
1678         --trip;
1679         /* use dynamic-style scheduling */
1680         init = chunkIdx * chunkspec + pr->u.p.count;
1681         /* need to verify init > 0 in case of overflow in the above
1682          * calculation */
1683         if ((status = (init > 0 && init <= trip)) != 0) {
1684           limit = init + chunkspec - 1;
1685 
1686           if ((last = (limit >= trip)) != 0)
1687             limit = trip;
1688         }
1689         break;
1690       } else {
1691 /* use exponential-style scheduling */
1692 /* The following check is to workaround the lack of long double precision on
1693    Windows* OS.
1694    This check works around the possible effect that init != 0 for chunkIdx == 0.
1695  */
1696 #if KMP_USE_X87CONTROL
1697         /* If we haven't already done so, save original
1698            FPCW and set precision to 64-bit, as Windows* OS
1699            on IA-32 architecture defaults to 53-bit */
1700         if (!fpcwSet) {
1701           oldFpcw = _control87(0, 0);
1702           _control87(_PC_64, _MCW_PC);
1703           fpcwSet = 0x30000;
1704         }
1705 #endif
1706         if (chunkIdx) {
1707           init = __kmp_dispatch_guided_remaining<T>(
1708               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1709           KMP_DEBUG_ASSERT(init);
1710           init = trip - init;
1711         } else
1712           init = 0;
1713         limit = trip - __kmp_dispatch_guided_remaining<T>(
1714                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1715         KMP_ASSERT(init <= limit);
1716         if (init < limit) {
1717           KMP_DEBUG_ASSERT(limit <= trip);
1718           --limit;
1719           status = 1;
1720           break;
1721         } // if
1722       } // if
1723     } // while (1)
1724 #if KMP_USE_X87CONTROL
1725     /* restore FPCW if necessary
1726        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1727     */
1728     if (fpcwSet && (oldFpcw & fpcwSet))
1729       _control87(oldFpcw, _MCW_PC);
1730 #endif
1731     if (status != 0) {
1732       start = pr->u.p.lb;
1733       incr = pr->u.p.st;
1734       if (p_st != NULL)
1735         *p_st = incr;
1736       *p_lb = start + init * incr;
1737       *p_ub = start + limit * incr;
1738       if (pr->flags.ordered) {
1739         pr->u.p.ordered_lower = init;
1740         pr->u.p.ordered_upper = limit;
1741       }
1742     } else {
1743       *p_lb = 0;
1744       *p_ub = 0;
1745       if (p_st != NULL)
1746         *p_st = 0;
1747     }
1748   } // case
1749   break;
1750 
1751   case kmp_sch_trapezoidal: {
1752     UT index;
1753     T parm2 = pr->u.p.parm2;
1754     T parm3 = pr->u.p.parm3;
1755     T parm4 = pr->u.p.parm4;
1756     KD_TRACE(100,
1757              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1758               gtid));
1759 
1760     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1761 
1762     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1763     trip = pr->u.p.tc - 1;
1764 
1765     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1766       *p_lb = 0;
1767       *p_ub = 0;
1768       if (p_st != NULL)
1769         *p_st = 0;
1770     } else {
1771       start = pr->u.p.lb;
1772       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1773       incr = pr->u.p.st;
1774 
1775       if ((last = (limit >= trip)) != 0)
1776         limit = trip;
1777 
1778       if (p_st != NULL)
1779         *p_st = incr;
1780 
1781       if (incr == 1) {
1782         *p_lb = start + init;
1783         *p_ub = start + limit;
1784       } else {
1785         *p_lb = start + init * incr;
1786         *p_ub = start + limit * incr;
1787       }
1788 
1789       if (pr->flags.ordered) {
1790         pr->u.p.ordered_lower = init;
1791         pr->u.p.ordered_upper = limit;
1792       } // if
1793     } // if
1794   } // case
1795   break;
1796   default: {
1797     status = 0; // to avoid complaints on uninitialized variable use
1798     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1799                 KMP_HNT(GetNewerLibrary), // Hint
1800                 __kmp_msg_null // Variadic argument list terminator
1801                 );
1802   } break;
1803   } // switch
1804   if (p_last)
1805     *p_last = last;
1806 #ifdef KMP_DEBUG
1807   if (pr->flags.ordered) {
1808     char *buff;
1809     // create format specifiers before the debug output
1810     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1811                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1812                             traits_t<UT>::spec, traits_t<UT>::spec);
1813     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1814     __kmp_str_free(&buff);
1815   }
1816   {
1817     char *buff;
1818     // create format specifiers before the debug output
1819     buff = __kmp_str_format(
1820         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1821         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1822         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1823     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1824     __kmp_str_free(&buff);
1825   }
1826 #endif
1827   return status;
1828 }
1829 
/* OMPT_LOOP_END is expanded on the exit paths of __kmp_dispatch_next() and
   relies on `status` and `codeptr` being visible at the expansion site.
   When status is 0 (no more work) and a work callback is registered, the
   tool is told that the loop is over; this is done here because
   kmp_dispatch_fini() is not called in every case. Without optional OMPT
   support the macro expands to nothing. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0 && ompt_enabled.ompt_callback_work) {                        \
    ompt_team_info_t *loop_team = __ompt_get_teaminfo(0, NULL);                \
    ompt_task_info_t *loop_task = __ompt_get_task_info_object(0);              \
    ompt_callbacks.ompt_callback(ompt_callback_work)(                          \
        ompt_work_loop, ompt_scope_end, &(loop_team->parallel_data),           \
        &(loop_task->task_data), 0, codeptr);                                  \
  }
// TODO: implement count (the callback is currently always passed 0)
#else
#define OMPT_LOOP_END // no-op
#endif
1848 
/* KMP_STATS_LOOP_END is expanded on the exit paths of __kmp_dispatch_next()
   and relies on `status`, `p_lb`, `p_ub` and `pr` being visible at the
   expansion site. It records the number of iterations in the chunk that was
   just handed out as an OMP_loop_dynamic_iterations sample; when status is 0
   it records 0 and pops the partitioned timer instead. Without statistics
   support the macro expands to nothing. */
#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    const kmp_int64 stats_lb = (kmp_int64)(*p_lb);                             \
    const kmp_int64 stats_ub = (kmp_int64)(*p_ub);                             \
    const kmp_int64 stats_st = (kmp_int64)(pr->u.p.st);                        \
    kmp_int64 stats_iters = 0; /* stays 0 for empty or finished ranges */      \
    if (status == 0) {                                                         \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (stats_st < 0) {                                                 \
      /* downward loop: lb is the numerically larger bound */                  \
      if (stats_lb >= stats_ub)                                                \
        stats_iters = (stats_lb - stats_ub) / (-stats_st) + 1;                 \
    } else if (stats_ub >= stats_lb) {                                         \
      if (stats_st == 1)                                                       \
        stats_iters = stats_ub - stats_lb + 1;                                 \
      else                                                                     \
        stats_iters = (stats_ub - stats_lb) / stats_st + 1;                    \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, stats_iters);                 \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
1880 
1881 template <typename T>
1882 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1883                                T *p_lb, T *p_ub,
1884                                typename traits_t<T>::signed_t *p_st
1885 #if OMPT_SUPPORT && OMPT_OPTIONAL
1886                                ,
1887                                void *codeptr
1888 #endif
1889                                ) {
1890 
1891   typedef typename traits_t<T>::unsigned_t UT;
1892   typedef typename traits_t<T>::signed_t ST;
1893   // This is potentially slightly misleading, schedule(runtime) will appear here
1894   // even if the actual runtme schedule is static. (Which points out a
1895   // disadavantage of schedule(runtime): even when static scheduling is used it
1896   // costs more than a compile time choice to use static scheduling would.)
1897   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1898 
1899   int status;
1900   dispatch_private_info_template<T> *pr;
1901   kmp_info_t *th = __kmp_threads[gtid];
1902   kmp_team_t *team = th->th.th_team;
1903 
1904   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1905   KD_TRACE(
1906       1000,
1907       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1908        gtid, p_lb, p_ub, p_st, p_last));
1909 
1910   if (team->t.t_serialized) {
1911     /* NOTE: serialize this dispatch becase we are not at the active level */
1912     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1913         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1914     KMP_DEBUG_ASSERT(pr);
1915 
1916     if ((status = (pr->u.p.tc != 0)) == 0) {
1917       *p_lb = 0;
1918       *p_ub = 0;
1919       //            if ( p_last != NULL )
1920       //                *p_last = 0;
1921       if (p_st != NULL)
1922         *p_st = 0;
1923       if (__kmp_env_consistency_check) {
1924         if (pr->pushed_ws != ct_none) {
1925           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1926         }
1927       }
1928     } else if (pr->flags.nomerge) {
1929       kmp_int32 last;
1930       T start;
1931       UT limit, trip, init;
1932       ST incr;
1933       T chunk = pr->u.p.parm1;
1934 
1935       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1936                      gtid));
1937 
1938       init = chunk * pr->u.p.count++;
1939       trip = pr->u.p.tc - 1;
1940 
1941       if ((status = (init <= trip)) == 0) {
1942         *p_lb = 0;
1943         *p_ub = 0;
1944         //                if ( p_last != NULL )
1945         //                    *p_last = 0;
1946         if (p_st != NULL)
1947           *p_st = 0;
1948         if (__kmp_env_consistency_check) {
1949           if (pr->pushed_ws != ct_none) {
1950             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1951           }
1952         }
1953       } else {
1954         start = pr->u.p.lb;
1955         limit = chunk + init - 1;
1956         incr = pr->u.p.st;
1957 
1958         if ((last = (limit >= trip)) != 0) {
1959           limit = trip;
1960 #if KMP_OS_WINDOWS
1961           pr->u.p.last_upper = pr->u.p.ub;
1962 #endif /* KMP_OS_WINDOWS */
1963         }
1964         if (p_last != NULL)
1965           *p_last = last;
1966         if (p_st != NULL)
1967           *p_st = incr;
1968         if (incr == 1) {
1969           *p_lb = start + init;
1970           *p_ub = start + limit;
1971         } else {
1972           *p_lb = start + init * incr;
1973           *p_ub = start + limit * incr;
1974         }
1975 
1976         if (pr->flags.ordered) {
1977           pr->u.p.ordered_lower = init;
1978           pr->u.p.ordered_upper = limit;
1979 #ifdef KMP_DEBUG
1980           {
1981             char *buff;
1982             // create format specifiers before the debug output
1983             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1984                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1985                                     traits_t<UT>::spec, traits_t<UT>::spec);
1986             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1987                             pr->u.p.ordered_upper));
1988             __kmp_str_free(&buff);
1989           }
1990 #endif
1991         } // if
1992       } // if
1993     } else {
1994       pr->u.p.tc = 0;
1995       *p_lb = pr->u.p.lb;
1996       *p_ub = pr->u.p.ub;
1997 #if KMP_OS_WINDOWS
1998       pr->u.p.last_upper = *p_ub;
1999 #endif /* KMP_OS_WINDOWS */
2000       if (p_last != NULL)
2001         *p_last = TRUE;
2002       if (p_st != NULL)
2003         *p_st = pr->u.p.st;
2004     } // if
2005 #ifdef KMP_DEBUG
2006     {
2007       char *buff;
2008       // create format specifiers before the debug output
2009       buff = __kmp_str_format(
2010           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2011           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2012           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2013       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2014       __kmp_str_free(&buff);
2015     }
2016 #endif
2017 #if INCLUDE_SSC_MARKS
2018     SSC_MARK_DISPATCH_NEXT();
2019 #endif
2020     OMPT_LOOP_END;
2021     KMP_STATS_LOOP_END;
2022     return status;
2023   } else {
2024     kmp_int32 last = 0;
2025     dispatch_shared_info_template<T> volatile *sh;
2026 
2027     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2028                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2029 
2030     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2031         th->th.th_dispatch->th_dispatch_pr_current);
2032     KMP_DEBUG_ASSERT(pr);
2033     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2034         th->th.th_dispatch->th_dispatch_sh_current);
2035     KMP_DEBUG_ASSERT(sh);
2036 
2037 #if KMP_USE_HIER_SCHED
2038     if (pr->flags.use_hier)
2039       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2040     else
2041 #endif // KMP_USE_HIER_SCHED
2042       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2043                                                 p_st, th->th.th_team_nproc,
2044                                                 th->th.th_info.ds.ds_tid);
2045     // status == 0: no more iterations to execute
2046     if (status == 0) {
2047       UT num_done;
2048 
2049       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2050 #ifdef KMP_DEBUG
2051       {
2052         char *buff;
2053         // create format specifiers before the debug output
2054         buff = __kmp_str_format(
2055             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2056             traits_t<UT>::spec);
2057         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2058         __kmp_str_free(&buff);
2059       }
2060 #endif
2061 
2062 #if KMP_USE_HIER_SCHED
2063       pr->flags.use_hier = FALSE;
2064 #endif
2065       if ((ST)num_done == th->th.th_team_nproc - 1) {
2066 #if (KMP_STATIC_STEAL_ENABLED)
2067         if (pr->schedule == kmp_sch_static_steal &&
2068             traits_t<T>::type_size > 4) {
2069           int i;
2070           kmp_info_t **other_threads = team->t.t_threads;
2071           // loop complete, safe to destroy locks used for stealing
2072           for (i = 0; i < th->th.th_team_nproc; ++i) {
2073             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2074             KMP_ASSERT(lck != NULL);
2075             __kmp_destroy_lock(lck);
2076             __kmp_free(lck);
2077             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2078           }
2079         }
2080 #endif
2081         /* NOTE: release this buffer to be reused */
2082 
2083         KMP_MB(); /* Flush all pending memory write invalidates.  */
2084 
2085         sh->u.s.num_done = 0;
2086         sh->u.s.iteration = 0;
2087 
2088         /* TODO replace with general release procedure? */
2089         if (pr->flags.ordered) {
2090           sh->u.s.ordered_iteration = 0;
2091         }
2092 
2093         KMP_MB(); /* Flush all pending memory write invalidates.  */
2094 
2095         sh->buffer_index += __kmp_dispatch_num_buffers;
2096         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2097                        gtid, sh->buffer_index));
2098 
2099         KMP_MB(); /* Flush all pending memory write invalidates.  */
2100 
2101       } // if
2102       if (__kmp_env_consistency_check) {
2103         if (pr->pushed_ws != ct_none) {
2104           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2105         }
2106       }
2107 
2108       th->th.th_dispatch->th_deo_fcn = NULL;
2109       th->th.th_dispatch->th_dxo_fcn = NULL;
2110       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2111       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2112     } // if (status == 0)
2113 #if KMP_OS_WINDOWS
2114     else if (last) {
2115       pr->u.p.last_upper = pr->u.p.ub;
2116     }
2117 #endif /* KMP_OS_WINDOWS */
2118     if (p_last != NULL && status != 0)
2119       *p_last = last;
2120   } // if
2121 
2122 #ifdef KMP_DEBUG
2123   {
2124     char *buff;
2125     // create format specifiers before the debug output
2126     buff = __kmp_str_format(
2127         "__kmp_dispatch_next: T#%%d normal case: "
2128         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2129         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2130     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2131                   (p_last ? *p_last : 0), status));
2132     __kmp_str_free(&buff);
2133   }
2134 #endif
2135 #if INCLUDE_SSC_MARKS
2136   SSC_MARK_DISPATCH_NEXT();
2137 #endif
2138   OMPT_LOOP_END;
2139   KMP_STATS_LOOP_END;
2140   return status;
2141 }
2142 
2143 template <typename T>
2144 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2145                                   kmp_int32 *plastiter, T *plower, T *pupper,
2146                                   typename traits_t<T>::signed_t incr) {
2147   typedef typename traits_t<T>::unsigned_t UT;
2148   kmp_uint32 team_id;
2149   kmp_uint32 nteams;
2150   UT trip_count;
2151   kmp_team_t *team;
2152   kmp_info_t *th;
2153 
2154   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2155   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2156 #ifdef KMP_DEBUG
2157   typedef typename traits_t<T>::signed_t ST;
2158   {
2159     char *buff;
2160     // create format specifiers before the debug output
2161     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2162                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2163                             traits_t<T>::spec, traits_t<T>::spec,
2164                             traits_t<ST>::spec, traits_t<T>::spec);
2165     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2166     __kmp_str_free(&buff);
2167   }
2168 #endif
2169 
2170   if (__kmp_env_consistency_check) {
2171     if (incr == 0) {
2172       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2173                             loc);
2174     }
2175     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2176       // The loop is illegal.
2177       // Some zero-trip loops maintained by compiler, e.g.:
2178       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2179       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2180       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2181       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2182       // Compiler does not check the following illegal loops:
2183       //   for(i=0;i<10;i+=incr) // where incr<0
2184       //   for(i=10;i>0;i-=incr) // where incr<0
2185       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2186     }
2187   }
2188   th = __kmp_threads[gtid];
2189   team = th->th.th_team;
2190   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2191   nteams = th->th.th_teams_size.nteams;
2192   team_id = team->t.t_master_tid;
2193   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2194 
2195   // compute global trip count
2196   if (incr == 1) {
2197     trip_count = *pupper - *plower + 1;
2198   } else if (incr == -1) {
2199     trip_count = *plower - *pupper + 1;
2200   } else if (incr > 0) {
2201     // upper-lower can exceed the limit of signed type
2202     trip_count = (UT)(*pupper - *plower) / incr + 1;
2203   } else {
2204     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2205   }
2206 
2207   if (trip_count <= nteams) {
2208     KMP_DEBUG_ASSERT(
2209         __kmp_static == kmp_sch_static_greedy ||
2210         __kmp_static ==
2211             kmp_sch_static_balanced); // Unknown static scheduling type.
2212     // only some teams get single iteration, others get nothing
2213     if (team_id < trip_count) {
2214       *pupper = *plower = *plower + team_id * incr;
2215     } else {
2216       *plower = *pupper + incr; // zero-trip loop
2217     }
2218     if (plastiter != NULL)
2219       *plastiter = (team_id == trip_count - 1);
2220   } else {
2221     if (__kmp_static == kmp_sch_static_balanced) {
2222       UT chunk = trip_count / nteams;
2223       UT extras = trip_count % nteams;
2224       *plower +=
2225           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2226       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2227       if (plastiter != NULL)
2228         *plastiter = (team_id == nteams - 1);
2229     } else {
2230       T chunk_inc_count =
2231           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2232       T upper = *pupper;
2233       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2234       // Unknown static scheduling type.
2235       *plower += team_id * chunk_inc_count;
2236       *pupper = *plower + chunk_inc_count - incr;
2237       // Check/correct bounds if needed
2238       if (incr > 0) {
2239         if (*pupper < *plower)
2240           *pupper = traits_t<T>::max_value;
2241         if (plastiter != NULL)
2242           *plastiter = *plower <= upper && *pupper > upper - incr;
2243         if (*pupper > upper)
2244           *pupper = upper; // tracker C73258
2245       } else {
2246         if (*pupper > *plower)
2247           *pupper = traits_t<T>::min_value;
2248         if (plastiter != NULL)
2249           *plastiter = *plower >= upper && *pupper < upper - incr;
2250         if (*pupper < upper)
2251           *pupper = upper; // tracker C73258
2252       }
2253     }
2254   }
2255 }
2256 
2257 //-----------------------------------------------------------------------------
2258 // Dispatch routines
2259 //    Transfer call to template< type T >
2260 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2261 //                         T lb, T ub, ST st, ST chunk )
2262 extern "C" {
2263 
2264 /*!
2265 @ingroup WORK_SHARING
2266 @{
2267 @param loc Source location
2268 @param gtid Global thread id
2269 @param schedule Schedule type
2270 @param lb  Lower bound
2271 @param ub  Upper bound
2272 @param st  Step (or increment if you prefer)
2273 @param chunk The chunk size to block with
2274 
2275 This function prepares the runtime to start a dynamically scheduled for loop,
2276 saving the loop arguments.
2277 These functions are all identical apart from the types of the arguments.
2278 */
2279 
2280 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2281                             enum sched_type schedule, kmp_int32 lb,
2282                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2283   KMP_DEBUG_ASSERT(__kmp_init_serial);
2284 #if OMPT_SUPPORT && OMPT_OPTIONAL
2285   OMPT_STORE_RETURN_ADDRESS(gtid);
2286 #endif
2287   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2288 }
2289 /*!
2290 See @ref __kmpc_dispatch_init_4
2291 */
2292 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2293                              enum sched_type schedule, kmp_uint32 lb,
2294                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2295   KMP_DEBUG_ASSERT(__kmp_init_serial);
2296 #if OMPT_SUPPORT && OMPT_OPTIONAL
2297   OMPT_STORE_RETURN_ADDRESS(gtid);
2298 #endif
2299   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2300 }
2301 
2302 /*!
2303 See @ref __kmpc_dispatch_init_4
2304 */
2305 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2306                             enum sched_type schedule, kmp_int64 lb,
2307                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2308   KMP_DEBUG_ASSERT(__kmp_init_serial);
2309 #if OMPT_SUPPORT && OMPT_OPTIONAL
2310   OMPT_STORE_RETURN_ADDRESS(gtid);
2311 #endif
2312   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2313 }
2314 
2315 /*!
2316 See @ref __kmpc_dispatch_init_4
2317 */
2318 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2319                              enum sched_type schedule, kmp_uint64 lb,
2320                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2321   KMP_DEBUG_ASSERT(__kmp_init_serial);
2322 #if OMPT_SUPPORT && OMPT_OPTIONAL
2323   OMPT_STORE_RETURN_ADDRESS(gtid);
2324 #endif
2325   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2326 }
2327 
2328 /*!
2329 See @ref __kmpc_dispatch_init_4
2330 
2331 Difference from __kmpc_dispatch_init set of functions is these functions
2332 are called for composite distribute parallel for construct. Thus before
2333 regular iterations dispatching we need to calc per-team iteration space.
2334 
2335 These functions are all identical apart from the types of the arguments.
2336 */
2337 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2338                                  enum sched_type schedule, kmp_int32 *p_last,
2339                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2340                                  kmp_int32 chunk) {
2341   KMP_DEBUG_ASSERT(__kmp_init_serial);
2342 #if OMPT_SUPPORT && OMPT_OPTIONAL
2343   OMPT_STORE_RETURN_ADDRESS(gtid);
2344 #endif
2345   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2346   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2347 }
2348 
2349 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2350                                   enum sched_type schedule, kmp_int32 *p_last,
2351                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2352                                   kmp_int32 chunk) {
2353   KMP_DEBUG_ASSERT(__kmp_init_serial);
2354 #if OMPT_SUPPORT && OMPT_OPTIONAL
2355   OMPT_STORE_RETURN_ADDRESS(gtid);
2356 #endif
2357   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2358   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2359 }
2360 
2361 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2362                                  enum sched_type schedule, kmp_int32 *p_last,
2363                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2364                                  kmp_int64 chunk) {
2365   KMP_DEBUG_ASSERT(__kmp_init_serial);
2366 #if OMPT_SUPPORT && OMPT_OPTIONAL
2367   OMPT_STORE_RETURN_ADDRESS(gtid);
2368 #endif
2369   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2370   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2371 }
2372 
2373 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2374                                   enum sched_type schedule, kmp_int32 *p_last,
2375                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2376                                   kmp_int64 chunk) {
2377   KMP_DEBUG_ASSERT(__kmp_init_serial);
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379   OMPT_STORE_RETURN_ADDRESS(gtid);
2380 #endif
2381   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2382   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2383 }
2384 
2385 /*!
2386 @param loc Source code location
2387 @param gtid Global thread id
2388 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2389 otherwise
2390 @param p_lb   Pointer to the lower bound for the next chunk of work
2391 @param p_ub   Pointer to the upper bound for the next chunk of work
2392 @param p_st   Pointer to the stride for the next chunk of work
2393 @return one if there is work to be done, zero otherwise
2394 
2395 Get the next dynamically allocated chunk of work for this thread.
2396 If there is no more work, then the lb,ub and stride need not be modified.
2397 */
2398 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2399                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2400 #if OMPT_SUPPORT && OMPT_OPTIONAL
2401   OMPT_STORE_RETURN_ADDRESS(gtid);
2402 #endif
2403   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2404 #if OMPT_SUPPORT && OMPT_OPTIONAL
2405                                         ,
2406                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2407 #endif
2408                                             );
2409 }
2410 
2411 /*!
2412 See @ref __kmpc_dispatch_next_4
2413 */
2414 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2415                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2416                             kmp_int32 *p_st) {
2417 #if OMPT_SUPPORT && OMPT_OPTIONAL
2418   OMPT_STORE_RETURN_ADDRESS(gtid);
2419 #endif
2420   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2421 #if OMPT_SUPPORT && OMPT_OPTIONAL
2422                                          ,
2423                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2424 #endif
2425                                              );
2426 }
2427 
2428 /*!
2429 See @ref __kmpc_dispatch_next_4
2430 */
2431 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2432                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2433 #if OMPT_SUPPORT && OMPT_OPTIONAL
2434   OMPT_STORE_RETURN_ADDRESS(gtid);
2435 #endif
2436   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2437 #if OMPT_SUPPORT && OMPT_OPTIONAL
2438                                         ,
2439                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2440 #endif
2441                                             );
2442 }
2443 
2444 /*!
2445 See @ref __kmpc_dispatch_next_4
2446 */
2447 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2448                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2449                             kmp_int64 *p_st) {
2450 #if OMPT_SUPPORT && OMPT_OPTIONAL
2451   OMPT_STORE_RETURN_ADDRESS(gtid);
2452 #endif
2453   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2454 #if OMPT_SUPPORT && OMPT_OPTIONAL
2455                                          ,
2456                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2457 #endif
2458                                              );
2459 }
2460 
2461 /*!
2462 @param loc Source code location
2463 @param gtid Global thread id
2464 
2465 Mark the end of a dynamic loop.
2466 */
2467 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2468   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2469 }
2470 
2471 /*!
2472 See @ref __kmpc_dispatch_fini_4
2473 */
2474 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2475   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2476 }
2477 
2478 /*!
2479 See @ref __kmpc_dispatch_fini_4
2480 */
2481 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2482   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2483 }
2484 
2485 /*!
2486 See @ref __kmpc_dispatch_fini_4
2487 */
2488 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2489   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2490 }
2491 /*! @} */
2492 
2493 //-----------------------------------------------------------------------------
2494 // Non-template routines from kmp_dispatch.cpp used in other sources
2495 
2496 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2497   return value == checker;
2498 }
2499 
2500 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2501   return value != checker;
2502 }
2503 
2504 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2505   return value < checker;
2506 }
2507 
2508 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2509   return value >= checker;
2510 }
2511 
2512 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2513   return value <= checker;
2514 }
2515 
2516 kmp_uint32
2517 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2518              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2519              void *obj // Higher-level synchronization object, or NULL.
2520              ) {
2521   // note: we may not belong to a team at this point
2522   volatile kmp_uint32 *spin = spinner;
2523   kmp_uint32 check = checker;
2524   kmp_uint32 spins;
2525   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2526   kmp_uint32 r;
2527 
2528   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2529   KMP_INIT_YIELD(spins);
2530   // main wait spin loop
2531   while (!f(r = TCR_4(*spin), check)) {
2532     KMP_FSYNC_SPIN_PREPARE(obj);
2533     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2534        split. It causes problems with infinite recursion because of exit lock */
2535     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2536         __kmp_abort_thread(); */
2537     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2538   }
2539   KMP_FSYNC_SPIN_ACQUIRED(obj);
2540   return r;
2541 }
2542 
2543 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2544                       kmp_uint32 (*pred)(void *, kmp_uint32),
2545                       void *obj // Higher-level synchronization object, or NULL.
2546                       ) {
2547   // note: we may not belong to a team at this point
2548   void *spin = spinner;
2549   kmp_uint32 check = checker;
2550   kmp_uint32 spins;
2551   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2552 
2553   KMP_FSYNC_SPIN_INIT(obj, spin);
2554   KMP_INIT_YIELD(spins);
2555   // main wait spin loop
2556   while (!f(spin, check)) {
2557     KMP_FSYNC_SPIN_PREPARE(obj);
2558     /* if we have waited a bit, or are noversubscribed, yield */
2559     /* pause is in the following code */
2560     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2561   }
2562   KMP_FSYNC_SPIN_ACQUIRED(obj);
2563 }
2564 
2565 } // extern "C"
2566 
2567 #ifdef KMP_GOMP_COMPAT
2568 
2569 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2570                                enum sched_type schedule, kmp_int32 lb,
2571                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2572                                int push_ws) {
2573   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2574                                  push_ws);
2575 }
2576 
2577 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2578                                 enum sched_type schedule, kmp_uint32 lb,
2579                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2580                                 int push_ws) {
2581   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2582                                   push_ws);
2583 }
2584 
2585 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2586                                enum sched_type schedule, kmp_int64 lb,
2587                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2588                                int push_ws) {
2589   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2590                                  push_ws);
2591 }
2592 
2593 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2594                                 enum sched_type schedule, kmp_uint64 lb,
2595                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2596                                 int push_ws) {
2597   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2598                                   push_ws);
2599 }
2600 
2601 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2602   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2603 }
2604 
2605 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2606   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2607 }
2608 
2609 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2610   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2611 }
2612 
2613 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2614   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2615 }
2616 
2617 #endif /* KMP_GOMP_COMPAT */
2618 
2619 /* ------------------------------------------------------------------------ */
2620