xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.cpp (revision 32a95656b51ebefcdf3e0b02c110825f59abd26f)
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  *       it may change values between parallel regions.  __kmp_max_nth
17  *       is the largest value __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // TODO: make nonmonotonic when static_steal is fixed
76   int monotonicity = SCHEDULE_MONOTONIC;
77 
78   // Let default be monotonic for executables
79   // compiled with OpenMP* 4.5 or less compilers
80   if (loc != NULL && loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier || __kmp_force_monotonic)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
92 
93 #if KMP_STATIC_STEAL_ENABLED
94 enum { // values for steal_flag (possible states of private per-loop buffer)
95   UNUSED = 0,
96   CLAIMED = 1, // owner thread started initialization
97   READY = 2, // available for stealing
98   THIEF = 3 // finished by owner, or claimed by thief
99   // possible state changes:
100   // 0 -> 1 owner only, sync
101   // 0 -> 3 thief only, sync
102   // 1 -> 2 owner only, async
103   // 2 -> 3 owner only, async
104   // 3 -> 2 owner only, async
105   // 3 -> 0 last thread finishing the loop, async
106 };
107 #endif
108 
109 // Initialize a dispatch_private_info_template<T> buffer for a particular
110 // type of schedule,chunk.  The loop description is found in lb (lower bound),
111 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
112 // to the scheduling (often the number of threads in a team, but not always if
113 // hierarchical scheduling is used).  tid is the id of the thread calling
114 // the function within the group of nproc threads.  It will have a value
115 // between 0 and nproc - 1.  This is often just the thread id within a team, but
116 // is not necessarily the case when using hierarchical scheduling.
117 // loc is the source file location of the corresponding loop
118 // gtid is the global thread id
119 template <typename T>
120 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
121                                    dispatch_private_info_template<T> *pr,
122                                    enum sched_type schedule, T lb, T ub,
123                                    typename traits_t<T>::signed_t st,
124 #if USE_ITT_BUILD
125                                    kmp_uint64 *cur_chunk,
126 #endif
127                                    typename traits_t<T>::signed_t chunk,
128                                    T nproc, T tid) {
129   typedef typename traits_t<T>::unsigned_t UT;
130   typedef typename traits_t<T>::floating_t DBL;
131 
132   int active;
133   T tc;
134   kmp_info_t *th;
135   kmp_team_t *team;
136   int monotonicity;
137   bool use_hier;
138 
139 #ifdef KMP_DEBUG
140   typedef typename traits_t<T>::signed_t ST;
141   {
142     char *buff;
143     // create format specifiers before the debug output
144     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
145                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
146                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
147                             traits_t<T>::spec, traits_t<T>::spec,
148                             traits_t<ST>::spec, traits_t<ST>::spec,
149                             traits_t<T>::spec, traits_t<T>::spec);
150     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
151     __kmp_str_free(&buff);
152   }
153 #endif
154   /* setup data */
155   th = __kmp_threads[gtid];
156   team = th->th.th_team;
157   active = !team->t.t_serialized;
158 
159 #if USE_ITT_BUILD
160   int itt_need_metadata_reporting =
161       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
162       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
163       team->t.t_active_level == 1;
164 #endif
165 
166 #if KMP_USE_HIER_SCHED
167   use_hier = pr->flags.use_hier;
168 #else
169   use_hier = false;
170 #endif
171 
172   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
173   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
174   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
175 
176   /* Pick up the nomerge/ordered bits from the scheduling type */
177   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
178     pr->flags.nomerge = TRUE;
179     schedule =
180         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
181   } else {
182     pr->flags.nomerge = FALSE;
183   }
184   pr->type_size = traits_t<T>::type_size; // remember the size of variables
185   if (kmp_ord_lower & schedule) {
186     pr->flags.ordered = TRUE;
187     schedule =
188         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
189   } else {
190     pr->flags.ordered = FALSE;
191   }
192   // Ordered overrides nonmonotonic
193   if (pr->flags.ordered) {
194     monotonicity = SCHEDULE_MONOTONIC;
195   }
196 
197   if (schedule == kmp_sch_static) {
198     schedule = __kmp_static;
199   } else {
200     if (schedule == kmp_sch_runtime) {
201       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
202       // not specified)
203       schedule = team->t.t_sched.r_sched_type;
204       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
205       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
206       if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
207         monotonicity = SCHEDULE_MONOTONIC;
208       // Detail the schedule if needed (global controls are differentiated
209       // appropriately)
210       if (schedule == kmp_sch_guided_chunked) {
211         schedule = __kmp_guided;
212       } else if (schedule == kmp_sch_static) {
213         schedule = __kmp_static;
214       }
215       // Use the chunk size specified by OMP_SCHEDULE (or default if not
216       // specified)
217       chunk = team->t.t_sched.chunk;
218 #if USE_ITT_BUILD
219       if (cur_chunk)
220         *cur_chunk = chunk;
221 #endif
222 #ifdef KMP_DEBUG
223       {
224         char *buff;
225         // create format specifiers before the debug output
226         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
227                                 "schedule:%%d chunk:%%%s\n",
228                                 traits_t<ST>::spec);
229         KD_TRACE(10, (buff, gtid, schedule, chunk));
230         __kmp_str_free(&buff);
231       }
232 #endif
233     } else {
234       if (schedule == kmp_sch_guided_chunked) {
235         schedule = __kmp_guided;
236       }
237       if (chunk <= 0) {
238         chunk = KMP_DEFAULT_CHUNK;
239       }
240     }
241 
242     if (schedule == kmp_sch_auto) {
243       // mapping and differentiation: in the __kmp_do_serial_initialize()
244       schedule = __kmp_auto;
245 #ifdef KMP_DEBUG
246       {
247         char *buff;
248         // create format specifiers before the debug output
249         buff = __kmp_str_format(
250             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
251             "schedule:%%d chunk:%%%s\n",
252             traits_t<ST>::spec);
253         KD_TRACE(10, (buff, gtid, schedule, chunk));
254         __kmp_str_free(&buff);
255       }
256 #endif
257     }
258 #if KMP_STATIC_STEAL_ENABLED
259     // map nonmonotonic:dynamic to static steal
260     if (schedule == kmp_sch_dynamic_chunked) {
261       if (monotonicity == SCHEDULE_NONMONOTONIC)
262         schedule = kmp_sch_static_steal;
263     }
264 #endif
265     /* guided analytical not safe for too many threads */
266     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
267       schedule = kmp_sch_guided_iterative_chunked;
268       KMP_WARNING(DispatchManyThreads);
269     }
270     if (schedule == kmp_sch_runtime_simd) {
271       // compiler provides simd_width in the chunk parameter
272       schedule = team->t.t_sched.r_sched_type;
273       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
274       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
275       // Detail the schedule if needed (global controls are differentiated
276       // appropriately)
277       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
278           schedule == __kmp_static) {
279         schedule = kmp_sch_static_balanced_chunked;
280       } else {
281         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
282           schedule = kmp_sch_guided_simd;
283         }
284         chunk = team->t.t_sched.chunk * chunk;
285       }
286 #if USE_ITT_BUILD
287       if (cur_chunk)
288         *cur_chunk = chunk;
289 #endif
290 #ifdef KMP_DEBUG
291       {
292         char *buff;
293         // create format specifiers before the debug output
294         buff = __kmp_str_format(
295             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
296             " chunk:%%%s\n",
297             traits_t<ST>::spec);
298         KD_TRACE(10, (buff, gtid, schedule, chunk));
299         __kmp_str_free(&buff);
300       }
301 #endif
302     }
303     pr->u.p.parm1 = chunk;
304   }
305   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
306               "unknown scheduling type");
307 
308   pr->u.p.count = 0;
309 
310   if (__kmp_env_consistency_check) {
311     if (st == 0) {
312       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
313                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
314     }
315   }
316   // compute trip count
317   if (st == 1) { // most common case
318     if (ub >= lb) {
319       tc = ub - lb + 1;
320     } else { // ub < lb
321       tc = 0; // zero-trip
322     }
323   } else if (st < 0) {
324     if (lb >= ub) {
325       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
326       // where the division needs to be unsigned regardless of the result type
327       tc = (UT)(lb - ub) / (-st) + 1;
328     } else { // lb < ub
329       tc = 0; // zero-trip
330     }
331   } else { // st > 0
332     if (ub >= lb) {
333       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
334       // where the division needs to be unsigned regardless of the result type
335       tc = (UT)(ub - lb) / st + 1;
336     } else { // ub < lb
337       tc = 0; // zero-trip
338     }
339   }
340 
341 #if KMP_STATS_ENABLED
342   if (KMP_MASTER_GTID(gtid)) {
343     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
344   }
345 #endif
346 
347   pr->u.p.lb = lb;
348   pr->u.p.ub = ub;
349   pr->u.p.st = st;
350   pr->u.p.tc = tc;
351 
352 #if KMP_OS_WINDOWS
353   pr->u.p.last_upper = ub + st;
354 #endif /* KMP_OS_WINDOWS */
355 
356   /* NOTE: only the active parallel region(s) has active ordered sections */
357 
358   if (active) {
359     if (pr->flags.ordered) {
360       pr->ordered_bumped = 0;
361       pr->u.p.ordered_lower = 1;
362       pr->u.p.ordered_upper = 0;
363     }
364   }
365 
366   switch (schedule) {
367 #if KMP_STATIC_STEAL_ENABLED
368   case kmp_sch_static_steal: {
369     T ntc, init;
370 
371     KD_TRACE(100,
372              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
373               gtid));
374 
375     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
376     if (nproc > 1 && ntc >= nproc) {
377       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
378       T id = tid;
379       T small_chunk, extras;
380       kmp_uint32 old = UNUSED;
381       int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
382       if (traits_t<T>::type_size > 4) {
383         // AC: TODO: check if 16-byte CAS available and use it to
384         // improve performance (probably wait for explicit request
385         // before spending time on this).
386         // For now use dynamically allocated per-private-buffer lock,
387         // free memory in __kmp_dispatch_next when status==0.
388         pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
389         __kmp_init_lock(pr->u.p.steal_lock);
390       }
391       small_chunk = ntc / nproc;
392       extras = ntc % nproc;
393 
394       init = id * small_chunk + (id < extras ? id : extras);
395       pr->u.p.count = init;
396       if (claimed) { // are we succeeded in claiming own buffer?
397         pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
398         // Other threads will inspect steal_flag when searching for a victim.
399         // READY means other threads may steal from this thread from now on.
400         KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
401       } else {
402         // other thread has stolen whole our range
403         KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
404         pr->u.p.ub = init; // mark there is no iterations to work on
405       }
406       pr->u.p.parm2 = ntc; // save number of chunks
407       // parm3 is the number of times to attempt stealing which is
408       // nproc (just a heuristics, could be optimized later on).
409       pr->u.p.parm3 = nproc;
410       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
411       break;
412     } else {
413       /* too few chunks: switching to kmp_sch_dynamic_chunked */
414       schedule = kmp_sch_dynamic_chunked;
415       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
416                      "kmp_sch_dynamic_chunked\n",
417                      gtid));
418       goto dynamic_init;
419       break;
420     } // if
421   } // case
422 #endif
423   case kmp_sch_static_balanced: {
424     T init, limit;
425 
426     KD_TRACE(
427         100,
428         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
429          gtid));
430 
431     if (nproc > 1) {
432       T id = tid;
433 
434       if (tc < nproc) {
435         if (id < tc) {
436           init = id;
437           limit = id;
438           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
439         } else {
440           pr->u.p.count = 1; /* means no more chunks to execute */
441           pr->u.p.parm1 = FALSE;
442           break;
443         }
444       } else {
445         T small_chunk = tc / nproc;
446         T extras = tc % nproc;
447         init = id * small_chunk + (id < extras ? id : extras);
448         limit = init + small_chunk - (id < extras ? 0 : 1);
449         pr->u.p.parm1 = (id == nproc - 1);
450       }
451     } else {
452       if (tc > 0) {
453         init = 0;
454         limit = tc - 1;
455         pr->u.p.parm1 = TRUE;
456       } else {
457         // zero trip count
458         pr->u.p.count = 1; /* means no more chunks to execute */
459         pr->u.p.parm1 = FALSE;
460         break;
461       }
462     }
463 #if USE_ITT_BUILD
464     // Calculate chunk for metadata report
465     if (itt_need_metadata_reporting)
466       if (cur_chunk)
467         *cur_chunk = limit - init + 1;
468 #endif
469     if (st == 1) {
470       pr->u.p.lb = lb + init;
471       pr->u.p.ub = lb + limit;
472     } else {
473       // calculated upper bound, "ub" is user-defined upper bound
474       T ub_tmp = lb + limit * st;
475       pr->u.p.lb = lb + init * st;
476       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
477       // it exactly
478       if (st > 0) {
479         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
480       } else {
481         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
482       }
483     }
484     if (pr->flags.ordered) {
485       pr->u.p.ordered_lower = init;
486       pr->u.p.ordered_upper = limit;
487     }
488     break;
489   } // case
490   case kmp_sch_static_balanced_chunked: {
491     // similar to balanced, but chunk adjusted to multiple of simd width
492     T nth = nproc;
493     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
494                    " -> falling-through to static_greedy\n",
495                    gtid));
496     schedule = kmp_sch_static_greedy;
497     if (nth > 1)
498       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
499     else
500       pr->u.p.parm1 = tc;
501     break;
502   } // case
503   case kmp_sch_guided_simd:
504   case kmp_sch_guided_iterative_chunked: {
505     KD_TRACE(
506         100,
507         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
508          " case\n",
509          gtid));
510 
511     if (nproc > 1) {
512       if ((2L * chunk + 1) * nproc >= tc) {
513         /* chunk size too large, switch to dynamic */
514         schedule = kmp_sch_dynamic_chunked;
515         goto dynamic_init;
516       } else {
517         // when remaining iters become less than parm2 - switch to dynamic
518         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
519         *(double *)&pr->u.p.parm3 =
520             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
521       }
522     } else {
523       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
524                      "kmp_sch_static_greedy\n",
525                      gtid));
526       schedule = kmp_sch_static_greedy;
527       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
528       KD_TRACE(
529           100,
530           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
531            gtid));
532       pr->u.p.parm1 = tc;
533     } // if
534   } // case
535   break;
536   case kmp_sch_guided_analytical_chunked: {
537     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
538                    "kmp_sch_guided_analytical_chunked case\n",
539                    gtid));
540 
541     if (nproc > 1) {
542       if ((2L * chunk + 1) * nproc >= tc) {
543         /* chunk size too large, switch to dynamic */
544         schedule = kmp_sch_dynamic_chunked;
545         goto dynamic_init;
546       } else {
547         /* commonly used term: (2 nproc - 1)/(2 nproc) */
548         DBL x;
549 
550 #if KMP_USE_X87CONTROL
551         /* Linux* OS already has 64-bit computation by default for long double,
552            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
553            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
554            instead of the default 53-bit. Even though long double doesn't work
555            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
556            expected to impact the correctness of the algorithm, but this has not
557            been mathematically proven. */
558         // save original FPCW and set precision to 64-bit, as
559         // Windows* OS on IA-32 architecture defaults to 53-bit
560         unsigned int oldFpcw = _control87(0, 0);
561         _control87(_PC_64, _MCW_PC); // 0,0x30000
562 #endif
563         /* value used for comparison in solver for cross-over point */
564         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
565 
566         /* crossover point--chunk indexes equal to or greater than
567            this point switch to dynamic-style scheduling */
568         UT cross;
569 
570         /* commonly used term: (2 nproc - 1)/(2 nproc) */
571         x = 1.0 - 0.5 / (double)nproc;
572 
573 #ifdef KMP_DEBUG
574         { // test natural alignment
575           struct _test_a {
576             char a;
577             union {
578               char b;
579               DBL d;
580             };
581           } t;
582           ptrdiff_t natural_alignment =
583               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
584           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
585           // long)natural_alignment );
586           KMP_DEBUG_ASSERT(
587               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
588         }
589 #endif // KMP_DEBUG
590 
591         /* save the term in thread private dispatch structure */
592         *(DBL *)&pr->u.p.parm3 = x;
593 
594         /* solve for the crossover point to the nearest integer i for which C_i
595            <= chunk */
596         {
597           UT left, right, mid;
598           long double p;
599 
600           /* estimate initial upper and lower bound */
601 
602           /* doesn't matter what value right is as long as it is positive, but
603              it affects performance of the solver */
604           right = 229;
605           p = __kmp_pow<UT>(x, right);
606           if (p > target) {
607             do {
608               p *= p;
609               right <<= 1;
610             } while (p > target && right < (1 << 27));
611             /* lower bound is previous (failed) estimate of upper bound */
612             left = right >> 1;
613           } else {
614             left = 0;
615           }
616 
617           /* bisection root-finding method */
618           while (left + 1 < right) {
619             mid = (left + right) / 2;
620             if (__kmp_pow<UT>(x, mid) > target) {
621               left = mid;
622             } else {
623               right = mid;
624             }
625           } // while
626           cross = right;
627         }
628         /* assert sanity of computed crossover point */
629         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
630                    __kmp_pow<UT>(x, cross) <= target);
631 
632         /* save the crossover point in thread private dispatch structure */
633         pr->u.p.parm2 = cross;
634 
635 // C75803
636 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
637 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
638 #else
639 #define GUIDED_ANALYTICAL_WORKAROUND (x)
640 #endif
641         /* dynamic-style scheduling offset */
642         pr->u.p.count = tc -
643                         __kmp_dispatch_guided_remaining(
644                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
645                         cross * chunk;
646 #if KMP_USE_X87CONTROL
647         // restore FPCW
648         _control87(oldFpcw, _MCW_PC);
649 #endif
650       } // if
651     } else {
652       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
653                      "kmp_sch_static_greedy\n",
654                      gtid));
655       schedule = kmp_sch_static_greedy;
656       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
657       pr->u.p.parm1 = tc;
658     } // if
659   } // case
660   break;
661   case kmp_sch_static_greedy:
662     KD_TRACE(
663         100,
664         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
665          gtid));
666     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
667     break;
668   case kmp_sch_static_chunked:
669   case kmp_sch_dynamic_chunked:
670   dynamic_init:
671     if (pr->u.p.parm1 <= 0)
672       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
673     else if (pr->u.p.parm1 > tc)
674       pr->u.p.parm1 = tc;
675     // Store the total number of chunks to prevent integer overflow during
676     // bounds calculations in the get next chunk routine.
677     pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
678     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
679                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
680                    gtid));
681     break;
682   case kmp_sch_trapezoidal: {
683     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
684 
685     T parm1, parm2, parm3, parm4;
686     KD_TRACE(100,
687              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
688               gtid));
689 
690     parm1 = chunk;
691 
692     /* F : size of the first cycle */
693     parm2 = (tc / (2 * nproc));
694 
695     if (parm2 < 1) {
696       parm2 = 1;
697     }
698 
699     /* L : size of the last cycle.  Make sure the last cycle is not larger
700        than the first cycle. */
701     if (parm1 < 1) {
702       parm1 = 1;
703     } else if (parm1 > parm2) {
704       parm1 = parm2;
705     }
706 
707     /* N : number of cycles */
708     parm3 = (parm2 + parm1);
709     parm3 = (2 * tc + parm3 - 1) / parm3;
710 
711     if (parm3 < 2) {
712       parm3 = 2;
713     }
714 
715     /* sigma : decreasing incr of the trapezoid */
716     parm4 = (parm3 - 1);
717     parm4 = (parm2 - parm1) / parm4;
718 
719     // pointless check, because parm4 >= 0 always
720     // if ( parm4 < 0 ) {
721     //    parm4 = 0;
722     //}
723 
724     pr->u.p.parm1 = parm1;
725     pr->u.p.parm2 = parm2;
726     pr->u.p.parm3 = parm3;
727     pr->u.p.parm4 = parm4;
728   } // case
729   break;
730 
731   default: {
732     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
733                 KMP_HNT(GetNewerLibrary), // Hint
734                 __kmp_msg_null // Variadic argument list terminator
735     );
736   } break;
737   } // switch
738   pr->schedule = schedule;
739 }
740 
741 #if KMP_USE_HIER_SCHED
742 template <typename T>
743 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
744                                              typename traits_t<T>::signed_t st);
745 template <>
746 inline void
747 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
748                                             kmp_int32 ub, kmp_int32 st) {
749   __kmp_dispatch_init_hierarchy<kmp_int32>(
750       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
751       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
752 }
753 template <>
754 inline void
755 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
756                                              kmp_uint32 ub, kmp_int32 st) {
757   __kmp_dispatch_init_hierarchy<kmp_uint32>(
758       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
759       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
760 }
761 template <>
762 inline void
763 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
764                                             kmp_int64 ub, kmp_int64 st) {
765   __kmp_dispatch_init_hierarchy<kmp_int64>(
766       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
767       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
768 }
769 template <>
770 inline void
771 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
772                                              kmp_uint64 ub, kmp_int64 st) {
773   __kmp_dispatch_init_hierarchy<kmp_uint64>(
774       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
775       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
776 }
777 
778 // free all the hierarchy scheduling memory associated with the team
779 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
780   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
781   for (int i = 0; i < num_disp_buff; ++i) {
782     // type does not matter here so use kmp_int32
783     auto sh =
784         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
785             &team->t.t_disp_buffer[i]);
786     if (sh->hier) {
787       sh->hier->deallocate();
788       __kmp_free(sh->hier);
789     }
790   }
791 }
792 #endif
793 
794 // UT - unsigned flavor of T, ST - signed flavor of T,
795 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
796 template <typename T>
797 static void
798 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
799                     T ub, typename traits_t<T>::signed_t st,
800                     typename traits_t<T>::signed_t chunk, int push_ws) {
801   typedef typename traits_t<T>::unsigned_t UT;
802 
803   int active;
804   kmp_info_t *th;
805   kmp_team_t *team;
806   kmp_uint32 my_buffer_index;
807   dispatch_private_info_template<T> *pr;
808   dispatch_shared_info_template<T> volatile *sh;
809 
810   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
811                    sizeof(dispatch_private_info));
812   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
813                    sizeof(dispatch_shared_info));
814   __kmp_assert_valid_gtid(gtid);
815 
816   if (!TCR_4(__kmp_init_parallel))
817     __kmp_parallel_initialize();
818 
819   __kmp_resume_if_soft_paused();
820 
821 #if INCLUDE_SSC_MARKS
822   SSC_MARK_DISPATCH_INIT();
823 #endif
824 #ifdef KMP_DEBUG
825   typedef typename traits_t<T>::signed_t ST;
826   {
827     char *buff;
828     // create format specifiers before the debug output
829     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
830                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
831                             traits_t<ST>::spec, traits_t<T>::spec,
832                             traits_t<T>::spec, traits_t<ST>::spec);
833     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
834     __kmp_str_free(&buff);
835   }
836 #endif
837   /* setup data */
838   th = __kmp_threads[gtid];
839   team = th->th.th_team;
840   active = !team->t.t_serialized;
841   th->th.th_ident = loc;
842 
843   // Any half-decent optimizer will remove this test when the blocks are empty
844   // since the macros expand to nothing
845   // when statistics are disabled.
846   if (schedule == __kmp_static) {
847     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
848   } else {
849     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
850   }
851 
852 #if KMP_USE_HIER_SCHED
853   // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
854   // Hierarchical scheduling does not work with ordered, so if ordered is
855   // detected, then revert back to threaded scheduling.
856   bool ordered;
857   enum sched_type my_sched = schedule;
858   my_buffer_index = th->th.th_dispatch->th_disp_index;
859   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
860       &th->th.th_dispatch
861            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
862   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
863   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
864     my_sched =
865         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
866   ordered = (kmp_ord_lower & my_sched);
867   if (pr->flags.use_hier) {
868     if (ordered) {
869       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
870                      "Disabling hierarchical scheduling.\n",
871                      gtid));
872       pr->flags.use_hier = FALSE;
873     }
874   }
875   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
876     // Don't use hierarchical for ordered parallel loops and don't
877     // use the runtime hierarchy if one was specified in the program
878     if (!ordered && !pr->flags.use_hier)
879       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
880   }
881 #endif // KMP_USE_HIER_SCHED
882 
883 #if USE_ITT_BUILD
884   kmp_uint64 cur_chunk = chunk;
885   int itt_need_metadata_reporting =
886       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
887       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
888       team->t.t_active_level == 1;
889 #endif
890   if (!active) {
891     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
892         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
893   } else {
894     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
895                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
896 
897     my_buffer_index = th->th.th_dispatch->th_disp_index++;
898 
899     /* What happens when number of threads changes, need to resize buffer? */
900     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
901         &th->th.th_dispatch
902              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
903     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
904         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
905     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
906                   my_buffer_index));
907     if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
908       KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
909                      " sh->buffer_index:%d\n",
910                      gtid, my_buffer_index, sh->buffer_index));
911       __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
912                              __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
913       // Note: KMP_WAIT() cannot be used there: buffer index and
914       // my_buffer_index are *always* 32-bit integers.
915       KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
916                      "sh->buffer_index:%d\n",
917                      gtid, my_buffer_index, sh->buffer_index));
918     }
919   }
920 
921   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
922 #if USE_ITT_BUILD
923                                 &cur_chunk,
924 #endif
925                                 chunk, (T)th->th.th_team_nproc,
926                                 (T)th->th.th_info.ds.ds_tid);
927   if (active) {
928     if (pr->flags.ordered == 0) {
929       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
930       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
931     } else {
932       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
933       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
934     }
935     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
936     th->th.th_dispatch->th_dispatch_sh_current =
937         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
938 #if USE_ITT_BUILD
939     if (pr->flags.ordered) {
940       __kmp_itt_ordered_init(gtid);
941     }
942     // Report loop metadata
943     if (itt_need_metadata_reporting) {
944       // Only report metadata by primary thread of active team at level 1
945       kmp_uint64 schedtype = 0;
946       switch (schedule) {
947       case kmp_sch_static_chunked:
948       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
949         break;
950       case kmp_sch_static_greedy:
951         cur_chunk = pr->u.p.parm1;
952         break;
953       case kmp_sch_dynamic_chunked:
954         schedtype = 1;
955         break;
956       case kmp_sch_guided_iterative_chunked:
957       case kmp_sch_guided_analytical_chunked:
958       case kmp_sch_guided_simd:
959         schedtype = 2;
960         break;
961       default:
962         // Should we put this case under "static"?
963         // case kmp_sch_static_steal:
964         schedtype = 3;
965         break;
966       }
967       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
968     }
969 #if KMP_USE_HIER_SCHED
970     if (pr->flags.use_hier) {
971       pr->u.p.count = 0;
972       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
973     }
974 #endif // KMP_USER_HIER_SCHED
975 #endif /* USE_ITT_BUILD */
976   }
977 
978 #ifdef KMP_DEBUG
979   {
980     char *buff;
981     // create format specifiers before the debug output
982     buff = __kmp_str_format(
983         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
984         "lb:%%%s ub:%%%s"
985         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
986         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
987         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
988         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
989         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
990         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
991     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
992                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
993                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
994                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
995     __kmp_str_free(&buff);
996   }
997 #endif
998 #if OMPT_SUPPORT && OMPT_OPTIONAL
999   if (ompt_enabled.ompt_callback_work) {
1000     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1001     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1002     ompt_callbacks.ompt_callback(ompt_callback_work)(
1003         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1004         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1005   }
1006 #endif
1007   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1008 }
1009 
1010 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1011  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1012  * every chunk of iterations.  If the ordered section(s) were not executed
1013  * for this iteration (or every iteration in this chunk), we need to set the
1014  * ordered iteration counters so that the next thread can proceed. */
1015 template <typename UT>
1016 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1017   typedef typename traits_t<UT>::signed_t ST;
1018   __kmp_assert_valid_gtid(gtid);
1019   kmp_info_t *th = __kmp_threads[gtid];
1020 
1021   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1022   if (!th->th.th_team->t.t_serialized) {
1023 
1024     dispatch_private_info_template<UT> *pr =
1025         reinterpret_cast<dispatch_private_info_template<UT> *>(
1026             th->th.th_dispatch->th_dispatch_pr_current);
1027     dispatch_shared_info_template<UT> volatile *sh =
1028         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1029             th->th.th_dispatch->th_dispatch_sh_current);
1030     KMP_DEBUG_ASSERT(pr);
1031     KMP_DEBUG_ASSERT(sh);
1032     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1033                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1034 
1035     if (pr->ordered_bumped) {
1036       KD_TRACE(
1037           1000,
1038           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1039            gtid));
1040       pr->ordered_bumped = 0;
1041     } else {
1042       UT lower = pr->u.p.ordered_lower;
1043 
1044 #ifdef KMP_DEBUG
1045       {
1046         char *buff;
1047         // create format specifiers before the debug output
1048         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1049                                 "ordered_iteration:%%%s lower:%%%s\n",
1050                                 traits_t<UT>::spec, traits_t<UT>::spec);
1051         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1052         __kmp_str_free(&buff);
1053       }
1054 #endif
1055 
1056       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1057                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1058       KMP_MB(); /* is this necessary? */
1059 #ifdef KMP_DEBUG
1060       {
1061         char *buff;
1062         // create format specifiers before the debug output
1063         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1064                                 "ordered_iteration:%%%s lower:%%%s\n",
1065                                 traits_t<UT>::spec, traits_t<UT>::spec);
1066         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1067         __kmp_str_free(&buff);
1068       }
1069 #endif
1070 
1071       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1072     } // if
1073   } // if
1074   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1075 }
1076 
1077 #ifdef KMP_GOMP_COMPAT
1078 
1079 template <typename UT>
1080 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1081   typedef typename traits_t<UT>::signed_t ST;
1082   __kmp_assert_valid_gtid(gtid);
1083   kmp_info_t *th = __kmp_threads[gtid];
1084 
1085   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1086   if (!th->th.th_team->t.t_serialized) {
1087     dispatch_private_info_template<UT> *pr =
1088         reinterpret_cast<dispatch_private_info_template<UT> *>(
1089             th->th.th_dispatch->th_dispatch_pr_current);
1090     dispatch_shared_info_template<UT> volatile *sh =
1091         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1092             th->th.th_dispatch->th_dispatch_sh_current);
1093     KMP_DEBUG_ASSERT(pr);
1094     KMP_DEBUG_ASSERT(sh);
1095     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1096                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1097 
1098     UT lower = pr->u.p.ordered_lower;
1099     UT upper = pr->u.p.ordered_upper;
1100     UT inc = upper - lower + 1;
1101 
1102     if (pr->ordered_bumped == inc) {
1103       KD_TRACE(
1104           1000,
1105           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1106            gtid));
1107       pr->ordered_bumped = 0;
1108     } else {
1109       inc -= pr->ordered_bumped;
1110 
1111 #ifdef KMP_DEBUG
1112       {
1113         char *buff;
1114         // create format specifiers before the debug output
1115         buff = __kmp_str_format(
1116             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1117             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1118             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1119         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1120         __kmp_str_free(&buff);
1121       }
1122 #endif
1123 
1124       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1125                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1126 
1127       KMP_MB(); /* is this necessary? */
1128       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1129                       "ordered_bumped to zero\n",
1130                       gtid));
1131       pr->ordered_bumped = 0;
1132 //!!!!! TODO check if the inc should be unsigned, or signed???
1133 #ifdef KMP_DEBUG
1134       {
1135         char *buff;
1136         // create format specifiers before the debug output
1137         buff = __kmp_str_format(
1138             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1139             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1140             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1141             traits_t<UT>::spec);
1142         KD_TRACE(1000,
1143                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1144         __kmp_str_free(&buff);
1145       }
1146 #endif
1147 
1148       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1149     }
1150     //        }
1151   }
1152   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1153 }
1154 
1155 #endif /* KMP_GOMP_COMPAT */
1156 
1157 template <typename T>
1158 int __kmp_dispatch_next_algorithm(int gtid,
1159                                   dispatch_private_info_template<T> *pr,
1160                                   dispatch_shared_info_template<T> volatile *sh,
1161                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1162                                   typename traits_t<T>::signed_t *p_st, T nproc,
1163                                   T tid) {
1164   typedef typename traits_t<T>::unsigned_t UT;
1165   typedef typename traits_t<T>::signed_t ST;
1166   typedef typename traits_t<T>::floating_t DBL;
1167   int status = 0;
1168   bool last = false;
1169   T start;
1170   ST incr;
1171   UT limit, trip, init;
1172   kmp_info_t *th = __kmp_threads[gtid];
1173   kmp_team_t *team = th->th.th_team;
1174 
1175   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1176                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1177   KMP_DEBUG_ASSERT(pr);
1178   KMP_DEBUG_ASSERT(sh);
1179   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1180 #ifdef KMP_DEBUG
1181   {
1182     char *buff;
1183     // create format specifiers before the debug output
1184     buff =
1185         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1186                          "sh:%%p nproc:%%%s tid:%%%s\n",
1187                          traits_t<T>::spec, traits_t<T>::spec);
1188     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1189     __kmp_str_free(&buff);
1190   }
1191 #endif
1192 
1193   // zero trip count
1194   if (pr->u.p.tc == 0) {
1195     KD_TRACE(10,
1196              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1197               "zero status:%d\n",
1198               gtid, status));
1199     return 0;
1200   }
1201 
1202   switch (pr->schedule) {
1203 #if KMP_STATIC_STEAL_ENABLED
1204   case kmp_sch_static_steal: {
1205     T chunk = pr->u.p.parm1;
1206     UT nchunks = pr->u.p.parm2;
1207     KD_TRACE(100,
1208              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1209               gtid));
1210 
1211     trip = pr->u.p.tc - 1;
1212 
1213     if (traits_t<T>::type_size > 4) {
1214       // use lock for 8-byte induction variable.
1215       // TODO (optional): check presence and use 16-byte CAS
1216       kmp_lock_t *lck = pr->u.p.steal_lock;
1217       KMP_DEBUG_ASSERT(lck != NULL);
1218       if (pr->u.p.count < (UT)pr->u.p.ub) {
1219         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1220         __kmp_acquire_lock(lck, gtid);
1221         // try to get own chunk of iterations
1222         init = (pr->u.p.count)++;
1223         status = (init < (UT)pr->u.p.ub);
1224         __kmp_release_lock(lck, gtid);
1225       } else {
1226         status = 0; // no own chunks
1227       }
1228       if (!status) { // try to steal
1229         kmp_lock_t *lckv; // victim buffer's lock
1230         T while_limit = pr->u.p.parm3;
1231         T while_index = 0;
1232         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1233                   __kmp_dispatch_num_buffers; // current loop index
1234         // note: victim thread can potentially execute another loop
1235         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1236         while ((!status) && (while_limit != ++while_index)) {
1237           dispatch_private_info_template<T> *v;
1238           T remaining;
1239           T victimId = pr->u.p.parm4;
1240           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1241           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1242               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1243           KMP_DEBUG_ASSERT(v);
1244           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1245                  oldVictimId != victimId) {
1246             victimId = (victimId + 1) % nproc;
1247             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1248                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1249             KMP_DEBUG_ASSERT(v);
1250           }
1251           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1252             continue; // try once more (nproc attempts in total)
1253           }
1254           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1255             kmp_uint32 old = UNUSED;
1256             // try to steal whole range from inactive victim
1257             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1258             if (status) {
1259               // initialize self buffer with victim's whole range of chunks
1260               T id = victimId;
1261               T small_chunk, extras;
1262               small_chunk = nchunks / nproc; // chunks per thread
1263               extras = nchunks % nproc;
1264               init = id * small_chunk + (id < extras ? id : extras);
1265               __kmp_acquire_lock(lck, gtid);
1266               pr->u.p.count = init + 1; // exclude one we execute immediately
1267               pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1268               __kmp_release_lock(lck, gtid);
1269               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1270               // no need to reinitialize other thread invariants: lb, st, etc.
1271 #ifdef KMP_DEBUG
1272               {
1273                 char *buff;
1274                 // create format specifiers before the debug output
1275                 buff = __kmp_str_format(
1276                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1277                     "count:%%%s ub:%%%s\n",
1278                     traits_t<UT>::spec, traits_t<T>::spec);
1279                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1280                 __kmp_str_free(&buff);
1281               }
1282 #endif
1283               // activate non-empty buffer and let others steal from us
1284               if (pr->u.p.count < (UT)pr->u.p.ub)
1285                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1286               break;
1287             }
1288           }
1289           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
1290               v->u.p.count >= (UT)v->u.p.ub) {
1291             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1292             continue; // no chunks to steal, try next victim
1293           }
1294           lckv = v->u.p.steal_lock;
1295           KMP_ASSERT(lckv != NULL);
1296           __kmp_acquire_lock(lckv, gtid);
1297           limit = v->u.p.ub; // keep initial ub
1298           if (v->u.p.count >= limit) {
1299             __kmp_release_lock(lckv, gtid);
1300             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1301             continue; // no chunks to steal, try next victim
1302           }
1303 
1304           // stealing succeded, reduce victim's ub by 1/4 of undone chunks
1305           // TODO: is this heuristics good enough??
1306           remaining = limit - v->u.p.count;
1307           if (remaining > 7) {
1308             // steal 1/4 of remaining
1309             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1310             init = (v->u.p.ub -= (remaining >> 2));
1311           } else {
1312             // steal 1 chunk of 1..7 remaining
1313             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1314             init = (v->u.p.ub -= 1);
1315           }
1316           __kmp_release_lock(lckv, gtid);
1317 #ifdef KMP_DEBUG
1318           {
1319             char *buff;
1320             // create format specifiers before the debug output
1321             buff = __kmp_str_format(
1322                 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1323                 "count:%%%s ub:%%%s\n",
1324                 traits_t<UT>::spec, traits_t<UT>::spec);
1325             KD_TRACE(10, (buff, gtid, victimId, init, limit));
1326             __kmp_str_free(&buff);
1327           }
1328 #endif
1329           KMP_DEBUG_ASSERT(init + 1 <= limit);
1330           pr->u.p.parm4 = victimId; // remember victim to steal from
1331           status = 1;
1332           // now update own count and ub with stolen range excluding init chunk
1333           __kmp_acquire_lock(lck, gtid);
1334           pr->u.p.count = init + 1;
1335           pr->u.p.ub = limit;
1336           __kmp_release_lock(lck, gtid);
1337           // activate non-empty buffer and let others steal from us
1338           if (init + 1 < limit)
1339             KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1340         } // while (search for victim)
1341       } // if (try to find victim and steal)
1342     } else {
1343       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1344       // as all operations on pair (count, ub) must be done atomically
1345       typedef union {
1346         struct {
1347           UT count;
1348           T ub;
1349         } p;
1350         kmp_int64 b;
1351       } union_i4;
1352       union_i4 vold, vnew;
1353       if (pr->u.p.count < (UT)pr->u.p.ub) {
1354         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1355         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1356         vnew.b = vold.b;
1357         vnew.p.count++; // get chunk from head of self range
1358         while (!KMP_COMPARE_AND_STORE_REL64(
1359             (volatile kmp_int64 *)&pr->u.p.count,
1360             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1361             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1362           KMP_CPU_PAUSE();
1363           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1364           vnew.b = vold.b;
1365           vnew.p.count++;
1366         }
1367         init = vold.p.count;
1368         status = (init < (UT)vold.p.ub);
1369       } else {
1370         status = 0; // no own chunks
1371       }
1372       if (!status) { // try to steal
1373         T while_limit = pr->u.p.parm3;
1374         T while_index = 0;
1375         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1376                   __kmp_dispatch_num_buffers; // current loop index
1377         // note: victim thread can potentially execute another loop
1378         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1379         while ((!status) && (while_limit != ++while_index)) {
1380           dispatch_private_info_template<T> *v;
1381           T remaining;
1382           T victimId = pr->u.p.parm4;
1383           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1384           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1385               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1386           KMP_DEBUG_ASSERT(v);
1387           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1388                  oldVictimId != victimId) {
1389             victimId = (victimId + 1) % nproc;
1390             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1391                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1392             KMP_DEBUG_ASSERT(v);
1393           }
1394           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1395             continue; // try once more (nproc attempts in total)
1396           }
1397           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1398             kmp_uint32 old = UNUSED;
1399             // try to steal whole range from inactive victim
1400             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1401             if (status) {
1402               // initialize self buffer with victim's whole range of chunks
1403               T id = victimId;
1404               T small_chunk, extras;
1405               small_chunk = nchunks / nproc; // chunks per thread
1406               extras = nchunks % nproc;
1407               init = id * small_chunk + (id < extras ? id : extras);
1408               vnew.p.count = init + 1;
1409               vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1410               // write pair (count, ub) at once atomically
1411 #if KMP_ARCH_X86
1412               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1413 #else
1414               *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1415 #endif
1416               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1417               // no need to initialize other thread invariants: lb, st, etc.
1418 #ifdef KMP_DEBUG
1419               {
1420                 char *buff;
1421                 // create format specifiers before the debug output
1422                 buff = __kmp_str_format(
1423                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1424                     "count:%%%s ub:%%%s\n",
1425                     traits_t<UT>::spec, traits_t<T>::spec);
1426                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1427                 __kmp_str_free(&buff);
1428               }
1429 #endif
1430               // activate non-empty buffer and let others steal from us
1431               if (pr->u.p.count < (UT)pr->u.p.ub)
1432                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1433               break;
1434             }
1435           }
1436           while (1) { // CAS loop with check if victim still has enough chunks
1437             // many threads may be stealing concurrently from same victim
1438             vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1439             if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1440                 vold.p.count >= (UT)vold.p.ub) {
1441               pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1442               break; // no chunks to steal, try next victim
1443             }
1444             vnew.b = vold.b;
1445             remaining = vold.p.ub - vold.p.count;
1446             // try to steal 1/4 of remaining
1447             // TODO: is this heuristics good enough??
1448             if (remaining > 7) {
1449               vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1450             } else {
1451               vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1452             }
1453             KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1454             if (KMP_COMPARE_AND_STORE_REL64(
1455                     (volatile kmp_int64 *)&v->u.p.count,
1456                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1457                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1458               // stealing succedded
1459 #ifdef KMP_DEBUG
1460               {
1461                 char *buff;
1462                 // create format specifiers before the debug output
1463                 buff = __kmp_str_format(
1464                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1465                     "count:%%%s ub:%%%s\n",
1466                     traits_t<T>::spec, traits_t<T>::spec);
1467                 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1468                 __kmp_str_free(&buff);
1469               }
1470 #endif
1471               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1472                                         vold.p.ub - vnew.p.ub);
1473               status = 1;
1474               pr->u.p.parm4 = victimId; // keep victim id
1475               // now update own count and ub
1476               init = vnew.p.ub;
1477               vold.p.count = init + 1;
1478 #if KMP_ARCH_X86
1479               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1480 #else
1481               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1482 #endif
1483               // activate non-empty buffer and let others steal from us
1484               if (vold.p.count < (UT)vold.p.ub)
1485                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1486               break;
1487             } // if (check CAS result)
1488             KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1489           } // while (try to steal from particular victim)
1490         } // while (search for victim)
1491       } // if (try to find victim and steal)
1492     } // if (4-byte induction variable)
1493     if (!status) {
1494       *p_lb = 0;
1495       *p_ub = 0;
1496       if (p_st != NULL)
1497         *p_st = 0;
1498     } else {
1499       start = pr->u.p.lb;
1500       init *= chunk;
1501       limit = chunk + init - 1;
1502       incr = pr->u.p.st;
1503       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1504 
1505       KMP_DEBUG_ASSERT(init <= trip);
1506       // keep track of done chunks for possible early exit from stealing
1507       // TODO: count executed chunks locally with rare update of shared location
1508       // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1509       if ((last = (limit >= trip)) != 0)
1510         limit = trip;
1511       if (p_st != NULL)
1512         *p_st = incr;
1513 
1514       if (incr == 1) {
1515         *p_lb = start + init;
1516         *p_ub = start + limit;
1517       } else {
1518         *p_lb = start + init * incr;
1519         *p_ub = start + limit * incr;
1520       }
1521     } // if
1522     break;
1523   } // case
1524 #endif // KMP_STATIC_STEAL_ENABLED
1525   case kmp_sch_static_balanced: {
1526     KD_TRACE(
1527         10,
1528         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1529          gtid));
1530     /* check if thread has any iteration to do */
1531     if ((status = !pr->u.p.count) != 0) {
1532       pr->u.p.count = 1;
1533       *p_lb = pr->u.p.lb;
1534       *p_ub = pr->u.p.ub;
1535       last = (pr->u.p.parm1 != 0);
1536       if (p_st != NULL)
1537         *p_st = pr->u.p.st;
1538     } else { /* no iterations to do */
1539       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1540     }
1541   } // case
1542   break;
1543   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1544                                  merged here */
1545   case kmp_sch_static_chunked: {
1546     T parm1;
1547 
1548     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1549                    "kmp_sch_static_[affinity|chunked] case\n",
1550                    gtid));
1551     parm1 = pr->u.p.parm1;
1552 
1553     trip = pr->u.p.tc - 1;
1554     init = parm1 * (pr->u.p.count + tid);
1555 
1556     if ((status = (init <= trip)) != 0) {
1557       start = pr->u.p.lb;
1558       incr = pr->u.p.st;
1559       limit = parm1 + init - 1;
1560 
1561       if ((last = (limit >= trip)) != 0)
1562         limit = trip;
1563 
1564       if (p_st != NULL)
1565         *p_st = incr;
1566 
1567       pr->u.p.count += nproc;
1568 
1569       if (incr == 1) {
1570         *p_lb = start + init;
1571         *p_ub = start + limit;
1572       } else {
1573         *p_lb = start + init * incr;
1574         *p_ub = start + limit * incr;
1575       }
1576 
1577       if (pr->flags.ordered) {
1578         pr->u.p.ordered_lower = init;
1579         pr->u.p.ordered_upper = limit;
1580       } // if
1581     } // if
1582   } // case
1583   break;
1584 
1585   case kmp_sch_dynamic_chunked: {
1586     UT chunk_number;
1587     UT chunk_size = pr->u.p.parm1;
1588     UT nchunks = pr->u.p.parm2;
1589 
1590     KD_TRACE(
1591         100,
1592         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1593          gtid));
1594 
1595     chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1596     status = (chunk_number < nchunks);
1597     if (!status) {
1598       *p_lb = 0;
1599       *p_ub = 0;
1600       if (p_st != NULL)
1601         *p_st = 0;
1602     } else {
1603       init = chunk_size * chunk_number;
1604       trip = pr->u.p.tc - 1;
1605       start = pr->u.p.lb;
1606       incr = pr->u.p.st;
1607 
1608       if ((last = (trip - init < (UT)chunk_size)))
1609         limit = trip;
1610       else
1611         limit = chunk_size + init - 1;
1612 
1613       if (p_st != NULL)
1614         *p_st = incr;
1615 
1616       if (incr == 1) {
1617         *p_lb = start + init;
1618         *p_ub = start + limit;
1619       } else {
1620         *p_lb = start + init * incr;
1621         *p_ub = start + limit * incr;
1622       }
1623 
1624       if (pr->flags.ordered) {
1625         pr->u.p.ordered_lower = init;
1626         pr->u.p.ordered_upper = limit;
1627       } // if
1628     } // if
1629   } // case
1630   break;
1631 
1632   case kmp_sch_guided_iterative_chunked: {
1633     T chunkspec = pr->u.p.parm1;
1634     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1635                    "iterative case\n",
1636                    gtid));
1637     trip = pr->u.p.tc;
1638     // Start atomic part of calculations
1639     while (1) {
1640       ST remaining; // signed, because can be < 0
1641       init = sh->u.s.iteration; // shared value
1642       remaining = trip - init;
1643       if (remaining <= 0) { // AC: need to compare with 0 first
1644         // nothing to do, don't try atomic op
1645         status = 0;
1646         break;
1647       }
1648       if ((T)remaining <
1649           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1650         // use dynamic-style schedule
1651         // atomically increment iterations, get old value
1652         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1653                                  (ST)chunkspec);
1654         remaining = trip - init;
1655         if (remaining <= 0) {
1656           status = 0; // all iterations got by other threads
1657         } else {
1658           // got some iterations to work on
1659           status = 1;
1660           if ((T)remaining > chunkspec) {
1661             limit = init + chunkspec - 1;
1662           } else {
1663             last = true; // the last chunk
1664             limit = init + remaining - 1;
1665           } // if
1666         } // if
1667         break;
1668       } // if
1669       limit = init + (UT)((double)remaining *
1670                           *(double *)&pr->u.p.parm3); // divide by K*nproc
1671       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1672                                (ST)init, (ST)limit)) {
1673         // CAS was successful, chunk obtained
1674         status = 1;
1675         --limit;
1676         break;
1677       } // if
1678     } // while
1679     if (status != 0) {
1680       start = pr->u.p.lb;
1681       incr = pr->u.p.st;
1682       if (p_st != NULL)
1683         *p_st = incr;
1684       *p_lb = start + init * incr;
1685       *p_ub = start + limit * incr;
1686       if (pr->flags.ordered) {
1687         pr->u.p.ordered_lower = init;
1688         pr->u.p.ordered_upper = limit;
1689       } // if
1690     } else {
1691       *p_lb = 0;
1692       *p_ub = 0;
1693       if (p_st != NULL)
1694         *p_st = 0;
1695     } // if
1696   } // case
1697   break;
1698 
1699   case kmp_sch_guided_simd: {
1700     // same as iterative but curr-chunk adjusted to be multiple of given
1701     // chunk
1702     T chunk = pr->u.p.parm1;
1703     KD_TRACE(100,
1704              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1705               gtid));
1706     trip = pr->u.p.tc;
1707     // Start atomic part of calculations
1708     while (1) {
1709       ST remaining; // signed, because can be < 0
1710       init = sh->u.s.iteration; // shared value
1711       remaining = trip - init;
1712       if (remaining <= 0) { // AC: need to compare with 0 first
1713         status = 0; // nothing to do, don't try atomic op
1714         break;
1715       }
1716       KMP_DEBUG_ASSERT(init % chunk == 0);
1717       // compare with K*nproc*(chunk+1), K=2 by default
1718       if ((T)remaining < pr->u.p.parm2) {
1719         // use dynamic-style schedule
1720         // atomically increment iterations, get old value
1721         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1722                                  (ST)chunk);
1723         remaining = trip - init;
1724         if (remaining <= 0) {
1725           status = 0; // all iterations got by other threads
1726         } else {
1727           // got some iterations to work on
1728           status = 1;
1729           if ((T)remaining > chunk) {
1730             limit = init + chunk - 1;
1731           } else {
1732             last = true; // the last chunk
1733             limit = init + remaining - 1;
1734           } // if
1735         } // if
1736         break;
1737       } // if
1738       // divide by K*nproc
1739       UT span;
1740       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1741                          &span);
1742       UT rem = span % chunk;
1743       if (rem) // adjust so that span%chunk == 0
1744         span += chunk - rem;
1745       limit = init + span;
1746       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1747                                (ST)init, (ST)limit)) {
1748         // CAS was successful, chunk obtained
1749         status = 1;
1750         --limit;
1751         break;
1752       } // if
1753     } // while
1754     if (status != 0) {
1755       start = pr->u.p.lb;
1756       incr = pr->u.p.st;
1757       if (p_st != NULL)
1758         *p_st = incr;
1759       *p_lb = start + init * incr;
1760       *p_ub = start + limit * incr;
1761       if (pr->flags.ordered) {
1762         pr->u.p.ordered_lower = init;
1763         pr->u.p.ordered_upper = limit;
1764       } // if
1765     } else {
1766       *p_lb = 0;
1767       *p_ub = 0;
1768       if (p_st != NULL)
1769         *p_st = 0;
1770     } // if
1771   } // case
1772   break;
1773 
1774   case kmp_sch_guided_analytical_chunked: {
1775     T chunkspec = pr->u.p.parm1;
1776     UT chunkIdx;
1777 #if KMP_USE_X87CONTROL
1778     /* for storing original FPCW value for Windows* OS on
1779        IA-32 architecture 8-byte version */
1780     unsigned int oldFpcw;
1781     unsigned int fpcwSet = 0;
1782 #endif
1783     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1784                    "kmp_sch_guided_analytical_chunked case\n",
1785                    gtid));
1786 
1787     trip = pr->u.p.tc;
1788 
1789     KMP_DEBUG_ASSERT(nproc > 1);
1790     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1791 
1792     while (1) { /* this while loop is a safeguard against unexpected zero
1793                    chunk sizes */
1794       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1795       if (chunkIdx >= (UT)pr->u.p.parm2) {
1796         --trip;
1797         /* use dynamic-style scheduling */
1798         init = chunkIdx * chunkspec + pr->u.p.count;
1799         /* need to verify init > 0 in case of overflow in the above
1800          * calculation */
1801         if ((status = (init > 0 && init <= trip)) != 0) {
1802           limit = init + chunkspec - 1;
1803 
1804           if ((last = (limit >= trip)) != 0)
1805             limit = trip;
1806         }
1807         break;
1808       } else {
1809 /* use exponential-style scheduling */
1810 /* The following check is to workaround the lack of long double precision on
1811    Windows* OS.
1812    This check works around the possible effect that init != 0 for chunkIdx == 0.
1813  */
1814 #if KMP_USE_X87CONTROL
1815         /* If we haven't already done so, save original
1816            FPCW and set precision to 64-bit, as Windows* OS
1817            on IA-32 architecture defaults to 53-bit */
1818         if (!fpcwSet) {
1819           oldFpcw = _control87(0, 0);
1820           _control87(_PC_64, _MCW_PC);
1821           fpcwSet = 0x30000;
1822         }
1823 #endif
1824         if (chunkIdx) {
1825           init = __kmp_dispatch_guided_remaining<T>(
1826               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1827           KMP_DEBUG_ASSERT(init);
1828           init = trip - init;
1829         } else
1830           init = 0;
1831         limit = trip - __kmp_dispatch_guided_remaining<T>(
1832                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1833         KMP_ASSERT(init <= limit);
1834         if (init < limit) {
1835           KMP_DEBUG_ASSERT(limit <= trip);
1836           --limit;
1837           status = 1;
1838           break;
1839         } // if
1840       } // if
1841     } // while (1)
1842 #if KMP_USE_X87CONTROL
1843     /* restore FPCW if necessary
1844        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1845     */
1846     if (fpcwSet && (oldFpcw & fpcwSet))
1847       _control87(oldFpcw, _MCW_PC);
1848 #endif
1849     if (status != 0) {
1850       start = pr->u.p.lb;
1851       incr = pr->u.p.st;
1852       if (p_st != NULL)
1853         *p_st = incr;
1854       *p_lb = start + init * incr;
1855       *p_ub = start + limit * incr;
1856       if (pr->flags.ordered) {
1857         pr->u.p.ordered_lower = init;
1858         pr->u.p.ordered_upper = limit;
1859       }
1860     } else {
1861       *p_lb = 0;
1862       *p_ub = 0;
1863       if (p_st != NULL)
1864         *p_st = 0;
1865     }
1866   } // case
1867   break;
1868 
1869   case kmp_sch_trapezoidal: {
1870     UT index;
1871     T parm2 = pr->u.p.parm2;
1872     T parm3 = pr->u.p.parm3;
1873     T parm4 = pr->u.p.parm4;
1874     KD_TRACE(100,
1875              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1876               gtid));
1877 
1878     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1879 
1880     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1881     trip = pr->u.p.tc - 1;
1882 
1883     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1884       *p_lb = 0;
1885       *p_ub = 0;
1886       if (p_st != NULL)
1887         *p_st = 0;
1888     } else {
1889       start = pr->u.p.lb;
1890       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1891       incr = pr->u.p.st;
1892 
1893       if ((last = (limit >= trip)) != 0)
1894         limit = trip;
1895 
1896       if (p_st != NULL)
1897         *p_st = incr;
1898 
1899       if (incr == 1) {
1900         *p_lb = start + init;
1901         *p_ub = start + limit;
1902       } else {
1903         *p_lb = start + init * incr;
1904         *p_ub = start + limit * incr;
1905       }
1906 
1907       if (pr->flags.ordered) {
1908         pr->u.p.ordered_lower = init;
1909         pr->u.p.ordered_upper = limit;
1910       } // if
1911     } // if
1912   } // case
1913   break;
1914   default: {
1915     status = 0; // to avoid complaints on uninitialized variable use
1916     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1917                 KMP_HNT(GetNewerLibrary), // Hint
1918                 __kmp_msg_null // Variadic argument list terminator
1919     );
1920   } break;
1921   } // switch
1922   if (p_last)
1923     *p_last = last;
1924 #ifdef KMP_DEBUG
1925   if (pr->flags.ordered) {
1926     char *buff;
1927     // create format specifiers before the debug output
1928     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1929                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1930                             traits_t<UT>::spec, traits_t<UT>::spec);
1931     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1932     __kmp_str_free(&buff);
1933   }
1934   {
1935     char *buff;
1936     // create format specifiers before the debug output
1937     buff = __kmp_str_format(
1938         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1939         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1940         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1941     KMP_DEBUG_ASSERT(p_last);
1942     KMP_DEBUG_ASSERT(p_st);
1943     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1944     __kmp_str_free(&buff);
1945   }
1946 #endif
1947   return status;
1948 }
1949 
1950 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1951    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1952    is not called. */
1953 #if OMPT_SUPPORT && OMPT_OPTIONAL
1954 #define OMPT_LOOP_END                                                          \
1955   if (status == 0) {                                                           \
1956     if (ompt_enabled.ompt_callback_work) {                                     \
1957       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1958       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1959       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1960           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1961           &(task_info->task_data), 0, codeptr);                                \
1962     }                                                                          \
1963   }
1964 // TODO: implement count
1965 #else
1966 #define OMPT_LOOP_END // no-op
1967 #endif
1968 
1969 #if KMP_STATS_ENABLED
1970 #define KMP_STATS_LOOP_END                                                     \
1971   {                                                                            \
1972     kmp_int64 u, l, t, i;                                                      \
1973     l = (kmp_int64)(*p_lb);                                                    \
1974     u = (kmp_int64)(*p_ub);                                                    \
1975     i = (kmp_int64)(pr->u.p.st);                                               \
1976     if (status == 0) {                                                         \
1977       t = 0;                                                                   \
1978       KMP_POP_PARTITIONED_TIMER();                                             \
1979     } else if (i == 1) {                                                       \
1980       if (u >= l)                                                              \
1981         t = u - l + 1;                                                         \
1982       else                                                                     \
1983         t = 0;                                                                 \
1984     } else if (i < 0) {                                                        \
1985       if (l >= u)                                                              \
1986         t = (l - u) / (-i) + 1;                                                \
1987       else                                                                     \
1988         t = 0;                                                                 \
1989     } else {                                                                   \
1990       if (u >= l)                                                              \
1991         t = (u - l) / i + 1;                                                   \
1992       else                                                                     \
1993         t = 0;                                                                 \
1994     }                                                                          \
1995     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1996   }
1997 #else
1998 #define KMP_STATS_LOOP_END /* Nothing */
1999 #endif
2000 
2001 template <typename T>
2002 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2003                                T *p_lb, T *p_ub,
2004                                typename traits_t<T>::signed_t *p_st
2005 #if OMPT_SUPPORT && OMPT_OPTIONAL
2006                                ,
2007                                void *codeptr
2008 #endif
2009 ) {
2010 
2011   typedef typename traits_t<T>::unsigned_t UT;
2012   typedef typename traits_t<T>::signed_t ST;
2013   // This is potentially slightly misleading, schedule(runtime) will appear here
2014   // even if the actual runtime schedule is static. (Which points out a
2015   // disadvantage of schedule(runtime): even when static scheduling is used it
2016   // costs more than a compile time choice to use static scheduling would.)
2017   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2018 
2019   int status;
2020   dispatch_private_info_template<T> *pr;
2021   __kmp_assert_valid_gtid(gtid);
2022   kmp_info_t *th = __kmp_threads[gtid];
2023   kmp_team_t *team = th->th.th_team;
2024 
2025   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2026   KD_TRACE(
2027       1000,
2028       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2029        gtid, p_lb, p_ub, p_st, p_last));
2030 
2031   if (team->t.t_serialized) {
2032     /* NOTE: serialize this dispatch because we are not at the active level */
2033     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2034         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2035     KMP_DEBUG_ASSERT(pr);
2036 
2037     if ((status = (pr->u.p.tc != 0)) == 0) {
2038       *p_lb = 0;
2039       *p_ub = 0;
2040       //            if ( p_last != NULL )
2041       //                *p_last = 0;
2042       if (p_st != NULL)
2043         *p_st = 0;
2044       if (__kmp_env_consistency_check) {
2045         if (pr->pushed_ws != ct_none) {
2046           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2047         }
2048       }
2049     } else if (pr->flags.nomerge) {
2050       kmp_int32 last;
2051       T start;
2052       UT limit, trip, init;
2053       ST incr;
2054       T chunk = pr->u.p.parm1;
2055 
2056       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2057                      gtid));
2058 
2059       init = chunk * pr->u.p.count++;
2060       trip = pr->u.p.tc - 1;
2061 
2062       if ((status = (init <= trip)) == 0) {
2063         *p_lb = 0;
2064         *p_ub = 0;
2065         //                if ( p_last != NULL )
2066         //                    *p_last = 0;
2067         if (p_st != NULL)
2068           *p_st = 0;
2069         if (__kmp_env_consistency_check) {
2070           if (pr->pushed_ws != ct_none) {
2071             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2072           }
2073         }
2074       } else {
2075         start = pr->u.p.lb;
2076         limit = chunk + init - 1;
2077         incr = pr->u.p.st;
2078 
2079         if ((last = (limit >= trip)) != 0) {
2080           limit = trip;
2081 #if KMP_OS_WINDOWS
2082           pr->u.p.last_upper = pr->u.p.ub;
2083 #endif /* KMP_OS_WINDOWS */
2084         }
2085         if (p_last != NULL)
2086           *p_last = last;
2087         if (p_st != NULL)
2088           *p_st = incr;
2089         if (incr == 1) {
2090           *p_lb = start + init;
2091           *p_ub = start + limit;
2092         } else {
2093           *p_lb = start + init * incr;
2094           *p_ub = start + limit * incr;
2095         }
2096 
2097         if (pr->flags.ordered) {
2098           pr->u.p.ordered_lower = init;
2099           pr->u.p.ordered_upper = limit;
2100 #ifdef KMP_DEBUG
2101           {
2102             char *buff;
2103             // create format specifiers before the debug output
2104             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2105                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2106                                     traits_t<UT>::spec, traits_t<UT>::spec);
2107             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2108                             pr->u.p.ordered_upper));
2109             __kmp_str_free(&buff);
2110           }
2111 #endif
2112         } // if
2113       } // if
2114     } else {
2115       pr->u.p.tc = 0;
2116       *p_lb = pr->u.p.lb;
2117       *p_ub = pr->u.p.ub;
2118 #if KMP_OS_WINDOWS
2119       pr->u.p.last_upper = *p_ub;
2120 #endif /* KMP_OS_WINDOWS */
2121       if (p_last != NULL)
2122         *p_last = TRUE;
2123       if (p_st != NULL)
2124         *p_st = pr->u.p.st;
2125     } // if
2126 #ifdef KMP_DEBUG
2127     {
2128       char *buff;
2129       // create format specifiers before the debug output
2130       buff = __kmp_str_format(
2131           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2132           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2133           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2134       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2135                     (p_last ? *p_last : 0), status));
2136       __kmp_str_free(&buff);
2137     }
2138 #endif
2139 #if INCLUDE_SSC_MARKS
2140     SSC_MARK_DISPATCH_NEXT();
2141 #endif
2142     OMPT_LOOP_END;
2143     KMP_STATS_LOOP_END;
2144     return status;
2145   } else {
2146     kmp_int32 last = 0;
2147     dispatch_shared_info_template<T> volatile *sh;
2148 
2149     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2150                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2151 
2152     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2153         th->th.th_dispatch->th_dispatch_pr_current);
2154     KMP_DEBUG_ASSERT(pr);
2155     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2156         th->th.th_dispatch->th_dispatch_sh_current);
2157     KMP_DEBUG_ASSERT(sh);
2158 
2159 #if KMP_USE_HIER_SCHED
2160     if (pr->flags.use_hier)
2161       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2162     else
2163 #endif // KMP_USE_HIER_SCHED
2164       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2165                                                 p_st, th->th.th_team_nproc,
2166                                                 th->th.th_info.ds.ds_tid);
2167     // status == 0: no more iterations to execute
2168     if (status == 0) {
2169       ST num_done;
2170       num_done = test_then_inc<ST>(&sh->u.s.num_done);
2171 #ifdef KMP_DEBUG
2172       {
2173         char *buff;
2174         // create format specifiers before the debug output
2175         buff = __kmp_str_format(
2176             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2177             traits_t<ST>::spec);
2178         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2179         __kmp_str_free(&buff);
2180       }
2181 #endif
2182 
2183 #if KMP_USE_HIER_SCHED
2184       pr->flags.use_hier = FALSE;
2185 #endif
2186       if (num_done == th->th.th_team_nproc - 1) {
2187 #if KMP_STATIC_STEAL_ENABLED
2188         if (pr->schedule == kmp_sch_static_steal) {
2189           int i;
2190           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2191                     __kmp_dispatch_num_buffers; // current loop index
2192           // loop complete, safe to destroy locks used for stealing
2193           for (i = 0; i < th->th.th_team_nproc; ++i) {
2194             dispatch_private_info_template<T> *buf =
2195                 reinterpret_cast<dispatch_private_info_template<T> *>(
2196                     &team->t.t_dispatch[i].th_disp_buffer[idx]);
2197             KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2198             KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2199             if (traits_t<T>::type_size > 4) {
2200               // destroy locks used for stealing
2201               kmp_lock_t *lck = buf->u.p.steal_lock;
2202               KMP_ASSERT(lck != NULL);
2203               __kmp_destroy_lock(lck);
2204               __kmp_free(lck);
2205               buf->u.p.steal_lock = NULL;
2206             }
2207           }
2208         }
2209 #endif
2210         /* NOTE: release shared buffer to be reused */
2211 
2212         KMP_MB(); /* Flush all pending memory write invalidates.  */
2213 
2214         sh->u.s.num_done = 0;
2215         sh->u.s.iteration = 0;
2216 
2217         /* TODO replace with general release procedure? */
2218         if (pr->flags.ordered) {
2219           sh->u.s.ordered_iteration = 0;
2220         }
2221 
2222         sh->buffer_index += __kmp_dispatch_num_buffers;
2223         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2224                        gtid, sh->buffer_index));
2225 
2226         KMP_MB(); /* Flush all pending memory write invalidates.  */
2227 
2228       } // if
2229       if (__kmp_env_consistency_check) {
2230         if (pr->pushed_ws != ct_none) {
2231           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2232         }
2233       }
2234 
2235       th->th.th_dispatch->th_deo_fcn = NULL;
2236       th->th.th_dispatch->th_dxo_fcn = NULL;
2237       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2238       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2239     } // if (status == 0)
2240 #if KMP_OS_WINDOWS
2241     else if (last) {
2242       pr->u.p.last_upper = pr->u.p.ub;
2243     }
2244 #endif /* KMP_OS_WINDOWS */
2245     if (p_last != NULL && status != 0)
2246       *p_last = last;
2247   } // if
2248 
2249 #ifdef KMP_DEBUG
2250   {
2251     char *buff;
2252     // create format specifiers before the debug output
2253     buff = __kmp_str_format(
2254         "__kmp_dispatch_next: T#%%d normal case: "
2255         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2256         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2257     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2258                   (p_last ? *p_last : 0), status));
2259     __kmp_str_free(&buff);
2260   }
2261 #endif
2262 #if INCLUDE_SSC_MARKS
2263   SSC_MARK_DISPATCH_NEXT();
2264 #endif
2265   OMPT_LOOP_END;
2266   KMP_STATS_LOOP_END;
2267   return status;
2268 }
2269 
2270 template <typename T>
2271 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2272                                   kmp_int32 *plastiter, T *plower, T *pupper,
2273                                   typename traits_t<T>::signed_t incr) {
2274   typedef typename traits_t<T>::unsigned_t UT;
2275   kmp_uint32 team_id;
2276   kmp_uint32 nteams;
2277   UT trip_count;
2278   kmp_team_t *team;
2279   kmp_info_t *th;
2280 
2281   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2282   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2283 #ifdef KMP_DEBUG
2284   typedef typename traits_t<T>::signed_t ST;
2285   {
2286     char *buff;
2287     // create format specifiers before the debug output
2288     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2289                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2290                             traits_t<T>::spec, traits_t<T>::spec,
2291                             traits_t<ST>::spec, traits_t<T>::spec);
2292     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2293     __kmp_str_free(&buff);
2294   }
2295 #endif
2296 
2297   if (__kmp_env_consistency_check) {
2298     if (incr == 0) {
2299       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2300                             loc);
2301     }
2302     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2303       // The loop is illegal.
2304       // Some zero-trip loops maintained by compiler, e.g.:
2305       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2306       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2307       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2308       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2309       // Compiler does not check the following illegal loops:
2310       //   for(i=0;i<10;i+=incr) // where incr<0
2311       //   for(i=10;i>0;i-=incr) // where incr<0
2312       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2313     }
2314   }
2315   __kmp_assert_valid_gtid(gtid);
2316   th = __kmp_threads[gtid];
2317   team = th->th.th_team;
2318   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2319   nteams = th->th.th_teams_size.nteams;
2320   team_id = team->t.t_master_tid;
2321   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2322 
2323   // compute global trip count
2324   if (incr == 1) {
2325     trip_count = *pupper - *plower + 1;
2326   } else if (incr == -1) {
2327     trip_count = *plower - *pupper + 1;
2328   } else if (incr > 0) {
2329     // upper-lower can exceed the limit of signed type
2330     trip_count = (UT)(*pupper - *plower) / incr + 1;
2331   } else {
2332     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2333   }
2334 
2335   if (trip_count <= nteams) {
2336     KMP_DEBUG_ASSERT(
2337         __kmp_static == kmp_sch_static_greedy ||
2338         __kmp_static ==
2339             kmp_sch_static_balanced); // Unknown static scheduling type.
2340     // only some teams get single iteration, others get nothing
2341     if (team_id < trip_count) {
2342       *pupper = *plower = *plower + team_id * incr;
2343     } else {
2344       *plower = *pupper + incr; // zero-trip loop
2345     }
2346     if (plastiter != NULL)
2347       *plastiter = (team_id == trip_count - 1);
2348   } else {
2349     if (__kmp_static == kmp_sch_static_balanced) {
2350       UT chunk = trip_count / nteams;
2351       UT extras = trip_count % nteams;
2352       *plower +=
2353           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2354       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2355       if (plastiter != NULL)
2356         *plastiter = (team_id == nteams - 1);
2357     } else {
2358       T chunk_inc_count =
2359           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2360       T upper = *pupper;
2361       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2362       // Unknown static scheduling type.
2363       *plower += team_id * chunk_inc_count;
2364       *pupper = *plower + chunk_inc_count - incr;
2365       // Check/correct bounds if needed
2366       if (incr > 0) {
2367         if (*pupper < *plower)
2368           *pupper = traits_t<T>::max_value;
2369         if (plastiter != NULL)
2370           *plastiter = *plower <= upper && *pupper > upper - incr;
2371         if (*pupper > upper)
2372           *pupper = upper; // tracker C73258
2373       } else {
2374         if (*pupper > *plower)
2375           *pupper = traits_t<T>::min_value;
2376         if (plastiter != NULL)
2377           *plastiter = *plower >= upper && *pupper < upper - incr;
2378         if (*pupper < upper)
2379           *pupper = upper; // tracker C73258
2380       }
2381     }
2382   }
2383 }
2384 
2385 //-----------------------------------------------------------------------------
2386 // Dispatch routines
2387 //    Transfer call to template< type T >
2388 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2389 //                         T lb, T ub, ST st, ST chunk )
2390 extern "C" {
2391 
2392 /*!
2393 @ingroup WORK_SHARING
2394 @{
2395 @param loc Source location
2396 @param gtid Global thread id
2397 @param schedule Schedule type
2398 @param lb  Lower bound
2399 @param ub  Upper bound
2400 @param st  Step (or increment if you prefer)
2401 @param chunk The chunk size to block with
2402 
2403 This function prepares the runtime to start a dynamically scheduled for loop,
2404 saving the loop arguments.
2405 These functions are all identical apart from the types of the arguments.
2406 */
2407 
2408 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2409                             enum sched_type schedule, kmp_int32 lb,
2410                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2411   KMP_DEBUG_ASSERT(__kmp_init_serial);
2412 #if OMPT_SUPPORT && OMPT_OPTIONAL
2413   OMPT_STORE_RETURN_ADDRESS(gtid);
2414 #endif
2415   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2416 }
2417 /*!
2418 See @ref __kmpc_dispatch_init_4
2419 */
2420 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2421                              enum sched_type schedule, kmp_uint32 lb,
2422                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2423   KMP_DEBUG_ASSERT(__kmp_init_serial);
2424 #if OMPT_SUPPORT && OMPT_OPTIONAL
2425   OMPT_STORE_RETURN_ADDRESS(gtid);
2426 #endif
2427   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2428 }
2429 
2430 /*!
2431 See @ref __kmpc_dispatch_init_4
2432 */
2433 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2434                             enum sched_type schedule, kmp_int64 lb,
2435                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2436   KMP_DEBUG_ASSERT(__kmp_init_serial);
2437 #if OMPT_SUPPORT && OMPT_OPTIONAL
2438   OMPT_STORE_RETURN_ADDRESS(gtid);
2439 #endif
2440   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2441 }
2442 
2443 /*!
2444 See @ref __kmpc_dispatch_init_4
2445 */
2446 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2447                              enum sched_type schedule, kmp_uint64 lb,
2448                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2449   KMP_DEBUG_ASSERT(__kmp_init_serial);
2450 #if OMPT_SUPPORT && OMPT_OPTIONAL
2451   OMPT_STORE_RETURN_ADDRESS(gtid);
2452 #endif
2453   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2454 }
2455 
2456 /*!
2457 See @ref __kmpc_dispatch_init_4
2458 
2459 Difference from __kmpc_dispatch_init set of functions is these functions
2460 are called for composite distribute parallel for construct. Thus before
2461 regular iterations dispatching we need to calc per-team iteration space.
2462 
2463 These functions are all identical apart from the types of the arguments.
2464 */
2465 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2466                                  enum sched_type schedule, kmp_int32 *p_last,
2467                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2468                                  kmp_int32 chunk) {
2469   KMP_DEBUG_ASSERT(__kmp_init_serial);
2470 #if OMPT_SUPPORT && OMPT_OPTIONAL
2471   OMPT_STORE_RETURN_ADDRESS(gtid);
2472 #endif
2473   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2474   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2475 }
2476 
2477 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2478                                   enum sched_type schedule, kmp_int32 *p_last,
2479                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2480                                   kmp_int32 chunk) {
2481   KMP_DEBUG_ASSERT(__kmp_init_serial);
2482 #if OMPT_SUPPORT && OMPT_OPTIONAL
2483   OMPT_STORE_RETURN_ADDRESS(gtid);
2484 #endif
2485   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2486   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2487 }
2488 
2489 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2490                                  enum sched_type schedule, kmp_int32 *p_last,
2491                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2492                                  kmp_int64 chunk) {
2493   KMP_DEBUG_ASSERT(__kmp_init_serial);
2494 #if OMPT_SUPPORT && OMPT_OPTIONAL
2495   OMPT_STORE_RETURN_ADDRESS(gtid);
2496 #endif
2497   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2498   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2499 }
2500 
2501 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2502                                   enum sched_type schedule, kmp_int32 *p_last,
2503                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2504                                   kmp_int64 chunk) {
2505   KMP_DEBUG_ASSERT(__kmp_init_serial);
2506 #if OMPT_SUPPORT && OMPT_OPTIONAL
2507   OMPT_STORE_RETURN_ADDRESS(gtid);
2508 #endif
2509   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2510   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2511 }
2512 
2513 /*!
2514 @param loc Source code location
2515 @param gtid Global thread id
2516 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2517 otherwise
2518 @param p_lb   Pointer to the lower bound for the next chunk of work
2519 @param p_ub   Pointer to the upper bound for the next chunk of work
2520 @param p_st   Pointer to the stride for the next chunk of work
2521 @return one if there is work to be done, zero otherwise
2522 
2523 Get the next dynamically allocated chunk of work for this thread.
2524 If there is no more work, then the lb,ub and stride need not be modified.
2525 */
2526 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2527                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2528 #if OMPT_SUPPORT && OMPT_OPTIONAL
2529   OMPT_STORE_RETURN_ADDRESS(gtid);
2530 #endif
2531   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2532 #if OMPT_SUPPORT && OMPT_OPTIONAL
2533                                         ,
2534                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2535 #endif
2536   );
2537 }
2538 
2539 /*!
2540 See @ref __kmpc_dispatch_next_4
2541 */
2542 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2543                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2544                             kmp_int32 *p_st) {
2545 #if OMPT_SUPPORT && OMPT_OPTIONAL
2546   OMPT_STORE_RETURN_ADDRESS(gtid);
2547 #endif
2548   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2549 #if OMPT_SUPPORT && OMPT_OPTIONAL
2550                                          ,
2551                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2552 #endif
2553   );
2554 }
2555 
2556 /*!
2557 See @ref __kmpc_dispatch_next_4
2558 */
2559 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2560                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2561 #if OMPT_SUPPORT && OMPT_OPTIONAL
2562   OMPT_STORE_RETURN_ADDRESS(gtid);
2563 #endif
2564   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2565 #if OMPT_SUPPORT && OMPT_OPTIONAL
2566                                         ,
2567                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2568 #endif
2569   );
2570 }
2571 
2572 /*!
2573 See @ref __kmpc_dispatch_next_4
2574 */
2575 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2576                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2577                             kmp_int64 *p_st) {
2578 #if OMPT_SUPPORT && OMPT_OPTIONAL
2579   OMPT_STORE_RETURN_ADDRESS(gtid);
2580 #endif
2581   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2582 #if OMPT_SUPPORT && OMPT_OPTIONAL
2583                                          ,
2584                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2585 #endif
2586   );
2587 }
2588 
2589 /*!
2590 @param loc Source code location
2591 @param gtid Global thread id
2592 
2593 Mark the end of a dynamic loop.
2594 */
2595 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2596   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2597 }
2598 
2599 /*!
2600 See @ref __kmpc_dispatch_fini_4
2601 */
2602 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2603   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2604 }
2605 
2606 /*!
2607 See @ref __kmpc_dispatch_fini_4
2608 */
2609 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2610   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2611 }
2612 
2613 /*!
2614 See @ref __kmpc_dispatch_fini_4
2615 */
2616 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2617   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2618 }
2619 /*! @} */
2620 
2621 //-----------------------------------------------------------------------------
2622 // Non-template routines from kmp_dispatch.cpp used in other sources
2623 
2624 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2625   return value == checker;
2626 }
2627 
2628 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2629   return value != checker;
2630 }
2631 
2632 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2633   return value < checker;
2634 }
2635 
2636 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2637   return value >= checker;
2638 }
2639 
2640 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2641   return value <= checker;
2642 }
2643 
2644 kmp_uint32
2645 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2646              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2647              void *obj // Higher-level synchronization object, or NULL.
2648 ) {
2649   // note: we may not belong to a team at this point
2650   volatile kmp_uint32 *spin = spinner;
2651   kmp_uint32 check = checker;
2652   kmp_uint32 spins;
2653   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2654   kmp_uint32 r;
2655 
2656   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2657   KMP_INIT_YIELD(spins);
2658   // main wait spin loop
2659   while (!f(r = TCR_4(*spin), check)) {
2660     KMP_FSYNC_SPIN_PREPARE(obj);
2661     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2662        split. It causes problems with infinite recursion because of exit lock */
2663     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2664         __kmp_abort_thread(); */
2665     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2666   }
2667   KMP_FSYNC_SPIN_ACQUIRED(obj);
2668   return r;
2669 }
2670 
2671 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2672                       kmp_uint32 (*pred)(void *, kmp_uint32),
2673                       void *obj // Higher-level synchronization object, or NULL.
2674 ) {
2675   // note: we may not belong to a team at this point
2676   void *spin = spinner;
2677   kmp_uint32 check = checker;
2678   kmp_uint32 spins;
2679   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2680 
2681   KMP_FSYNC_SPIN_INIT(obj, spin);
2682   KMP_INIT_YIELD(spins);
2683   // main wait spin loop
2684   while (!f(spin, check)) {
2685     KMP_FSYNC_SPIN_PREPARE(obj);
2686     /* if we have waited a bit, or are noversubscribed, yield */
2687     /* pause is in the following code */
2688     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2689   }
2690   KMP_FSYNC_SPIN_ACQUIRED(obj);
2691 }
2692 
2693 } // extern "C"
2694 
2695 #ifdef KMP_GOMP_COMPAT
2696 
2697 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2698                                enum sched_type schedule, kmp_int32 lb,
2699                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2700                                int push_ws) {
2701   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2702                                  push_ws);
2703 }
2704 
2705 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2706                                 enum sched_type schedule, kmp_uint32 lb,
2707                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2708                                 int push_ws) {
2709   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2710                                   push_ws);
2711 }
2712 
2713 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2714                                enum sched_type schedule, kmp_int64 lb,
2715                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2716                                int push_ws) {
2717   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2718                                  push_ws);
2719 }
2720 
2721 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2722                                 enum sched_type schedule, kmp_uint64 lb,
2723                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2724                                 int push_ws) {
2725   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2726                                   push_ws);
2727 }
2728 
2729 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2730   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2731 }
2732 
2733 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2734   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2735 }
2736 
2737 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2738   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2739 }
2740 
2741 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2742   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2743 }
2744 
2745 #endif /* KMP_GOMP_COMPAT */
2746 
2747 /* ------------------------------------------------------------------------ */
2748