xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.cpp (revision 79ac3c12a714bcd3f2354c52d948aed9575c46d6)
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is constant within any dispatch loop; however,
16  *       it may change between parallel regions.  __kmp_max_nth
17  *       is the largest value __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // TODO: make nonmonotonic when static_steal is fixed
76   int monotonicity = SCHEDULE_MONOTONIC;
77 
78   // Let the default be monotonic for executables
79   // compiled by OpenMP* 4.5 (or earlier) compilers
80   if (loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
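// For example, a schedule(nonmonotonic: dynamic) clause arrives here with the
// nonmonotonic modifier bit set, so (with use_hier == false) this routine
// returns SCHEDULE_NONMONOTONIC; ordered loops are forced back to monotonic
// later in __kmp_dispatch_init_algorithm (illustrative note, not exhaustive).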
92 
93 // Initialize a dispatch_private_info_template<T> buffer for a particular
94 // type of schedule and chunk.  The loop description is found in lb (lower bound),
95 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
96 // to the scheduling (often the number of threads in a team, but not always if
97 // hierarchical scheduling is used).  tid is the id of the thread calling
98 // the function within the group of nproc threads.  It will have a value
99 // between 0 and nproc - 1.  This is often just the thread id within a team, but
100 // is not necessarily the case when using hierarchical scheduling.
101 // loc is the source file location of the corresponding loop
102 // gtid is the global thread id
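// For example (illustrative values only), a team of nproc = 4 threads running
//   #pragma omp for schedule(dynamic, 8)
// over i = 0..99 would have each thread call this with lb = 0, ub = 99, st = 1,
// chunk = 8, and its own tid in 0..3; the routine fills in pr for that thread.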
103 template <typename T>
104 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
105                                    dispatch_private_info_template<T> *pr,
106                                    enum sched_type schedule, T lb, T ub,
107                                    typename traits_t<T>::signed_t st,
108 #if USE_ITT_BUILD
109                                    kmp_uint64 *cur_chunk,
110 #endif
111                                    typename traits_t<T>::signed_t chunk,
112                                    T nproc, T tid) {
113   typedef typename traits_t<T>::unsigned_t UT;
114   typedef typename traits_t<T>::floating_t DBL;
115 
116   int active;
117   T tc;
118   kmp_info_t *th;
119   kmp_team_t *team;
120   int monotonicity;
121   bool use_hier;
122 
123 #ifdef KMP_DEBUG
124   typedef typename traits_t<T>::signed_t ST;
125   {
126     char *buff;
127     // create format specifiers before the debug output
128     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
129                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
130                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
131                             traits_t<T>::spec, traits_t<T>::spec,
132                             traits_t<ST>::spec, traits_t<ST>::spec,
133                             traits_t<T>::spec, traits_t<T>::spec);
134     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
135     __kmp_str_free(&buff);
136   }
137 #endif
138   /* setup data */
139   th = __kmp_threads[gtid];
140   team = th->th.th_team;
141   active = !team->t.t_serialized;
142 
143 #if USE_ITT_BUILD
144   int itt_need_metadata_reporting =
145       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
146       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
147       team->t.t_active_level == 1;
148 #endif
149 
150 #if KMP_USE_HIER_SCHED
151   use_hier = pr->flags.use_hier;
152 #else
153   use_hier = false;
154 #endif
155 
156   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
157   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
158   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
159 
160   /* Pick up the nomerge/ordered bits from the scheduling type */
161   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
162     pr->flags.nomerge = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
165   } else {
166     pr->flags.nomerge = FALSE;
167   }
168   pr->type_size = traits_t<T>::type_size; // remember the size of variables
169   if (kmp_ord_lower & schedule) {
170     pr->flags.ordered = TRUE;
171     schedule =
172         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
173   } else {
174     pr->flags.ordered = FALSE;
175   }
176   // Ordered overrides nonmonotonic
177   if (pr->flags.ordered) {
178     monotonicity = SCHEDULE_MONOTONIC;
179   }
180 
181   if (schedule == kmp_sch_static) {
182     schedule = __kmp_static;
183   } else {
184     if (schedule == kmp_sch_runtime) {
185       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
186       // not specified)
187       schedule = team->t.t_sched.r_sched_type;
188       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
189       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
190       // Detail the schedule if needed (global controls are differentiated
191       // appropriately)
192       if (schedule == kmp_sch_guided_chunked) {
193         schedule = __kmp_guided;
194       } else if (schedule == kmp_sch_static) {
195         schedule = __kmp_static;
196       }
197       // Use the chunk size specified by OMP_SCHEDULE (or default if not
198       // specified)
199       chunk = team->t.t_sched.chunk;
200 #if USE_ITT_BUILD
201       if (cur_chunk)
202         *cur_chunk = chunk;
203 #endif
204 #ifdef KMP_DEBUG
205       {
206         char *buff;
207         // create format specifiers before the debug output
208         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
209                                 "schedule:%%d chunk:%%%s\n",
210                                 traits_t<ST>::spec);
211         KD_TRACE(10, (buff, gtid, schedule, chunk));
212         __kmp_str_free(&buff);
213       }
214 #endif
215     } else {
216       if (schedule == kmp_sch_guided_chunked) {
217         schedule = __kmp_guided;
218       }
219       if (chunk <= 0) {
220         chunk = KMP_DEFAULT_CHUNK;
221       }
222     }
223 
224     if (schedule == kmp_sch_auto) {
225       // auto is mapped to a concrete schedule in __kmp_do_serial_initialize()
226       schedule = __kmp_auto;
227 #ifdef KMP_DEBUG
228       {
229         char *buff;
230         // create format specifiers before the debug output
231         buff = __kmp_str_format(
232             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
233             "schedule:%%d chunk:%%%s\n",
234             traits_t<ST>::spec);
235         KD_TRACE(10, (buff, gtid, schedule, chunk));
236         __kmp_str_free(&buff);
237       }
238 #endif
239     }
240 #if KMP_STATIC_STEAL_ENABLED
241     // map nonmonotonic:dynamic to static steal
242     if (schedule == kmp_sch_dynamic_chunked) {
243       if (monotonicity == SCHEDULE_NONMONOTONIC)
244         schedule = kmp_sch_static_steal;
245     }
246 #endif
247     /* guided analytical not safe for too many threads */
248     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
249       schedule = kmp_sch_guided_iterative_chunked;
250       KMP_WARNING(DispatchManyThreads);
251     }
252     if (schedule == kmp_sch_runtime_simd) {
253       // compiler provides simd_width in the chunk parameter
254       schedule = team->t.t_sched.r_sched_type;
255       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
256       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
257       // Detail the schedule if needed (global controls are differentiated
258       // appropriately)
259       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
260           schedule == __kmp_static) {
261         schedule = kmp_sch_static_balanced_chunked;
262       } else {
263         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
264           schedule = kmp_sch_guided_simd;
265         }
266         chunk = team->t.t_sched.chunk * chunk;
267       }
268 #if USE_ITT_BUILD
269       if (cur_chunk)
270         *cur_chunk = chunk;
271 #endif
272 #ifdef KMP_DEBUG
273       {
274         char *buff;
275         // create format specifiers before the debug output
276         buff = __kmp_str_format(
277             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
278             " chunk:%%%s\n",
279             traits_t<ST>::spec);
280         KD_TRACE(10, (buff, gtid, schedule, chunk));
281         __kmp_str_free(&buff);
282       }
283 #endif
284     }
285     pr->u.p.parm1 = chunk;
286   }
287   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
288               "unknown scheduling type");
289 
290   pr->u.p.count = 0;
291 
292   if (__kmp_env_consistency_check) {
293     if (st == 0) {
294       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
295                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
296     }
297   }
298   // compute trip count
299   if (st == 1) { // most common case
300     if (ub >= lb) {
301       tc = ub - lb + 1;
302     } else { // ub < lb
303       tc = 0; // zero-trip
304     }
305   } else if (st < 0) {
306     if (lb >= ub) {
307       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(lb - ub) / (-st) + 1;
310     } else { // lb < ub
311       tc = 0; // zero-trip
312     }
313   } else { // st > 0
314     if (ub >= lb) {
315       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
316       // where the division needs to be unsigned regardless of the result type
317       tc = (UT)(ub - lb) / st + 1;
318     } else { // ub < lb
319       tc = 0; // zero-trip
320     }
321   }
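  // Illustrative examples of the trip count computation above:
  //   lb = 0,  ub = 9, st = 2  -> tc = (9 - 0) / 2 + 1 = 5
  //   lb = 10, ub = 1, st = -3 -> tc = (10 - 1) / 3 + 1 = 4
  //   st = 1 with ub < lb      -> tc = 0 (zero-trip loop)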
322 
323 #if KMP_STATS_ENABLED
324   if (KMP_MASTER_GTID(gtid)) {
325     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
326   }
327 #endif
328 
329   pr->u.p.lb = lb;
330   pr->u.p.ub = ub;
331   pr->u.p.st = st;
332   pr->u.p.tc = tc;
333 
334 #if KMP_OS_WINDOWS
335   pr->u.p.last_upper = ub + st;
336 #endif /* KMP_OS_WINDOWS */
337 
338   /* NOTE: only the active parallel region(s) have active ordered sections */
339 
340   if (active) {
341     if (pr->flags.ordered) {
342       pr->ordered_bumped = 0;
343       pr->u.p.ordered_lower = 1;
344       pr->u.p.ordered_upper = 0;
345     }
346   }
347 
348   switch (schedule) {
349 #if (KMP_STATIC_STEAL_ENABLED)
350   case kmp_sch_static_steal: {
351     T ntc, init;
352 
353     KD_TRACE(100,
354              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
355               gtid));
356 
357     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
358     if (nproc > 1 && ntc >= nproc) {
359       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
360       T id = tid;
361       T small_chunk, extras;
362 
363       small_chunk = ntc / nproc;
364       extras = ntc % nproc;
365 
366       init = id * small_chunk + (id < extras ? id : extras);
367       pr->u.p.count = init;
368       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
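      // Illustrative example of the chunk partitioning above: tc = 100 and
      // chunk = 7 give ntc = 15 chunks; with nproc = 4, small_chunk = 3 and
      // extras = 3, so id = 2 gets init = 2 * 3 + 2 = 8 and ub = 12, i.e. it
      // initially owns chunks 8..11 (count/ub index chunks, not iterations).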
369 
370       pr->u.p.parm2 = lb;
371       // parm3 is the number of times to attempt stealing, which is
372       // proportional to the number of chunks per thread, capped at
373       // a maximum of nproc.
374       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
375       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
376       pr->u.p.st = st;
377       if (traits_t<T>::type_size > 4) {
378         // AC: TODO: check if 16-byte CAS available and use it to
379         // improve performance (probably wait for explicit request
380         // before spending time on this).
381         // For now use dynamically allocated per-thread lock,
382         // free memory in __kmp_dispatch_next when status==0.
383         KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
384         pr->u.p.th_steal_lock =
385             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
386         __kmp_init_lock(pr->u.p.th_steal_lock);
387       }
388       break;
389     } else {
390       /* too few chunks: switching to kmp_sch_dynamic_chunked */
391       schedule = kmp_sch_dynamic_chunked;
392       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
393                      "kmp_sch_dynamic_chunked\n",
394                       gtid));
395       if (pr->u.p.parm1 <= 0)
396         pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
397       break;
398     } // if
399   } // case
400 #endif
401   case kmp_sch_static_balanced: {
402     T init, limit;
403 
404     KD_TRACE(
405         100,
406         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
407          gtid));
408 
409     if (nproc > 1) {
410       T id = tid;
411 
412       if (tc < nproc) {
413         if (id < tc) {
414           init = id;
415           limit = id;
416           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
417         } else {
418           pr->u.p.count = 1; /* means no more chunks to execute */
419           pr->u.p.parm1 = FALSE;
420           break;
421         }
422       } else {
423         T small_chunk = tc / nproc;
424         T extras = tc % nproc;
425         init = id * small_chunk + (id < extras ? id : extras);
426         limit = init + small_chunk - (id < extras ? 0 : 1);
427         pr->u.p.parm1 = (id == nproc - 1);
428       }
429     } else {
430       if (tc > 0) {
431         init = 0;
432         limit = tc - 1;
433         pr->u.p.parm1 = TRUE;
434       } else {
435         // zero trip count
436         pr->u.p.count = 1; /* means no more chunks to execute */
437         pr->u.p.parm1 = FALSE;
438         break;
439       }
440     }
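    // Illustrative example of the partitioning above: tc = 10, nproc = 4 gives
    // small_chunk = 2, extras = 2; id = 1 gets logical iterations 3..5 (three
    // iterations), id = 3 gets 8..9 (two iterations), and only id == nproc - 1
    // sets parm1 (the lastprivate flag).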
441 #if USE_ITT_BUILD
442     // Calculate chunk for metadata report
443     if (itt_need_metadata_reporting)
444       if (cur_chunk)
445         *cur_chunk = limit - init + 1;
446 #endif
447     if (st == 1) {
448       pr->u.p.lb = lb + init;
449       pr->u.p.ub = lb + limit;
450     } else {
451       // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
452       T ub_tmp = lb + limit * st;
453       pr->u.p.lb = lb + init * st;
454       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
455       // it exactly
456       if (st > 0) {
457         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
458       } else {
459         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
460       }
461     }
462     if (pr->flags.ordered) {
463       pr->u.p.ordered_lower = init;
464       pr->u.p.ordered_upper = limit;
465     }
466     break;
467   } // case
468   case kmp_sch_static_balanced_chunked: {
469     // similar to balanced, but chunk adjusted to multiple of simd width
470     T nth = nproc;
471     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
472                    " -> falling-through to static_greedy\n",
473                    gtid));
474     schedule = kmp_sch_static_greedy;
475     if (nth > 1)
476       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
477     else
478       pr->u.p.parm1 = tc;
479     break;
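    // Illustrative example (assuming chunk holds the simd width, a power of
    // two): tc = 100, nth = 8, chunk = 8 -> ceil(100 / 8) = 13 iterations per
    // thread, rounded up by the mask to parm1 = 16, a multiple of the simd
    // width.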
480   } // case
481   case kmp_sch_guided_simd:
482   case kmp_sch_guided_iterative_chunked: {
483     KD_TRACE(
484         100,
485         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
486          " case\n",
487          gtid));
488 
489     if (nproc > 1) {
490       if ((2L * chunk + 1) * nproc >= tc) {
491         /* chunk size too large, switch to dynamic */
492         schedule = kmp_sch_dynamic_chunked;
493       } else {
494         // when the remaining iterations become less than parm2, switch to dynamic
495         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
496         *(double *)&pr->u.p.parm3 =
497             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
498       }
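      // Illustrative example of the threshold above: nproc = 4, chunk = 10
      // gives (2 * 10 + 1) * 4 = 84, so a loop with tc <= 84 falls back to
      // kmp_sch_dynamic_chunked instead of guided scheduling.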
499     } else {
500       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
501                      "kmp_sch_static_greedy\n",
502                      gtid));
503       schedule = kmp_sch_static_greedy;
504       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
505       KD_TRACE(
506           100,
507           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
508            gtid));
509       pr->u.p.parm1 = tc;
510     } // if
511   } // case
512   break;
513   case kmp_sch_guided_analytical_chunked: {
514     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
515                    "kmp_sch_guided_analytical_chunked case\n",
516                    gtid));
517 
518     if (nproc > 1) {
519       if ((2L * chunk + 1) * nproc >= tc) {
520         /* chunk size too large, switch to dynamic */
521         schedule = kmp_sch_dynamic_chunked;
522       } else {
523         /* commonly used term: (2 nproc - 1)/(2 nproc) */
524         DBL x;
525 
526 #if KMP_USE_X87CONTROL
527         /* Linux* OS already has 64-bit computation by default for long double,
528            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
529            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
530            instead of the default 53-bit. Even though long double doesn't work
531            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
532            expected to impact the correctness of the algorithm, but this has not
533            been mathematically proven. */
534         // save original FPCW and set precision to 64-bit, as
535         // Windows* OS on IA-32 architecture defaults to 53-bit
536         unsigned int oldFpcw = _control87(0, 0);
537         _control87(_PC_64, _MCW_PC); // 0,0x30000
538 #endif
539         /* value used for comparison in solver for cross-over point */
540         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
541 
542         /* crossover point--chunk indexes equal to or greater than
543            this point switch to dynamic-style scheduling */
544         UT cross;
545 
546         /* commonly used term: (2 nproc - 1)/(2 nproc) */
547         x = 1.0 - 0.5 / (double)nproc;
548 
549 #ifdef KMP_DEBUG
550         { // test natural alignment
551           struct _test_a {
552             char a;
553             union {
554               char b;
555               DBL d;
556             };
557           } t;
558           ptrdiff_t natural_alignment =
559               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
560           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
561           // long)natural_alignment );
562           KMP_DEBUG_ASSERT(
563               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
564         }
565 #endif // KMP_DEBUG
566 
567         /* save the term in thread private dispatch structure */
568         *(DBL *)&pr->u.p.parm3 = x;
569 
570         /* solve for the crossover point to the nearest integer i for which C_i
571            <= chunk */
572         {
573           UT left, right, mid;
574           long double p;
575 
576           /* estimate initial upper and lower bound */
577 
578           /* doesn't matter what value right is as long as it is positive, but
579              it affects performance of the solver */
580           right = 229;
581           p = __kmp_pow<UT>(x, right);
582           if (p > target) {
583             do {
584               p *= p;
585               right <<= 1;
586             } while (p > target && right < (1 << 27));
587             /* lower bound is previous (failed) estimate of upper bound */
588             left = right >> 1;
589           } else {
590             left = 0;
591           }
592 
593           /* bisection root-finding method */
594           while (left + 1 < right) {
595             mid = (left + right) / 2;
596             if (__kmp_pow<UT>(x, mid) > target) {
597               left = mid;
598             } else {
599               right = mid;
600             }
601           } // while
602           cross = right;
603         }
604         /* assert sanity of computed crossover point */
605         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
606                    __kmp_pow<UT>(x, cross) <= target);
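        /* Illustrative example: nproc = 2, chunk = 4, tc = 1000 give x = 0.75
           and target = 9 * 2 / 1000 = 0.018; the bisection yields cross = 14,
           since 0.75^13 ~ 0.0238 > 0.018 while 0.75^14 ~ 0.0178 <= 0.018. */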
607 
608         /* save the crossover point in thread private dispatch structure */
609         pr->u.p.parm2 = cross;
610 
611 // C75803
612 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
613 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
614 #else
615 #define GUIDED_ANALYTICAL_WORKAROUND (x)
616 #endif
617         /* dynamic-style scheduling offset */
618         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
619                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
620                         cross * chunk;
621 #if KMP_USE_X87CONTROL
622         // restore FPCW
623         _control87(oldFpcw, _MCW_PC);
624 #endif
625       } // if
626     } else {
627       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
628                      "kmp_sch_static_greedy\n",
629                      gtid));
630       schedule = kmp_sch_static_greedy;
631       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
632       pr->u.p.parm1 = tc;
633     } // if
634   } // case
635   break;
636   case kmp_sch_static_greedy:
637     KD_TRACE(
638         100,
639         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
640          gtid));
641     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
642     break;
643   case kmp_sch_static_chunked:
644   case kmp_sch_dynamic_chunked:
645     if (pr->u.p.parm1 <= 0) {
646       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
647     }
648     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
649                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
650                    gtid));
651     break;
652   case kmp_sch_trapezoidal: {
653     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
654 
655     T parm1, parm2, parm3, parm4;
656     KD_TRACE(100,
657              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
658               gtid));
659 
660     parm1 = chunk;
661 
662     /* F : size of the first cycle */
663     parm2 = (tc / (2 * nproc));
664 
665     if (parm2 < 1) {
666       parm2 = 1;
667     }
668 
669     /* L : size of the last cycle.  Make sure the last cycle is not larger
670        than the first cycle. */
671     if (parm1 < 1) {
672       parm1 = 1;
673     } else if (parm1 > parm2) {
674       parm1 = parm2;
675     }
676 
677     /* N : number of cycles */
678     parm3 = (parm2 + parm1);
679     parm3 = (2 * tc + parm3 - 1) / parm3;
680 
681     if (parm3 < 2) {
682       parm3 = 2;
683     }
684 
685     /* sigma : decreasing incr of the trapezoid */
686     parm4 = (parm3 - 1);
687     parm4 = (parm2 - parm1) / parm4;
688 
689     // pointless check, because parm4 >= 0 always
690     // if ( parm4 < 0 ) {
691     //    parm4 = 0;
692     //}
693 
694     pr->u.p.parm1 = parm1;
695     pr->u.p.parm2 = parm2;
696     pr->u.p.parm3 = parm3;
697     pr->u.p.parm4 = parm4;
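    // Illustrative example of the parameters above: tc = 1000, nproc = 4,
    // chunk = 10 give parm2 = 125 (first chunk), parm1 = 10 (minimum/last
    // chunk), parm3 = ceil(2000 / 135) = 15 chunks, and
    // parm4 = (125 - 10) / 14 = 8, so chunk sizes decrease roughly as
    // 125, 117, 109, ...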
698   } // case
699   break;
700 
701   default: {
702     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
703                 KMP_HNT(GetNewerLibrary), // Hint
704                 __kmp_msg_null // Variadic argument list terminator
705                 );
706   } break;
707   } // switch
708   pr->schedule = schedule;
709 }
710 
711 #if KMP_USE_HIER_SCHED
712 template <typename T>
713 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
714                                              typename traits_t<T>::signed_t st);
715 template <>
716 inline void
717 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
718                                             kmp_int32 ub, kmp_int32 st) {
719   __kmp_dispatch_init_hierarchy<kmp_int32>(
720       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722 }
723 template <>
724 inline void
725 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
726                                              kmp_uint32 ub, kmp_int32 st) {
727   __kmp_dispatch_init_hierarchy<kmp_uint32>(
728       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
730 }
731 template <>
732 inline void
733 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
734                                             kmp_int64 ub, kmp_int64 st) {
735   __kmp_dispatch_init_hierarchy<kmp_int64>(
736       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738 }
739 template <>
740 inline void
741 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
742                                              kmp_uint64 ub, kmp_int64 st) {
743   __kmp_dispatch_init_hierarchy<kmp_uint64>(
744       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
745       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
746 }
747 
748 // free all the hierarchy scheduling memory associated with the team
749 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
750   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
751   for (int i = 0; i < num_disp_buff; ++i) {
752     // type does not matter here so use kmp_int32
753     auto sh =
754         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
755             &team->t.t_disp_buffer[i]);
756     if (sh->hier) {
757       sh->hier->deallocate();
758       __kmp_free(sh->hier);
759     }
760   }
761 }
762 #endif
763 
764 // UT - unsigned flavor of T, ST - signed flavor of T,
765 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
766 template <typename T>
767 static void
768 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
769                     T ub, typename traits_t<T>::signed_t st,
770                     typename traits_t<T>::signed_t chunk, int push_ws) {
771   typedef typename traits_t<T>::unsigned_t UT;
772 
773   int active;
774   kmp_info_t *th;
775   kmp_team_t *team;
776   kmp_uint32 my_buffer_index;
777   dispatch_private_info_template<T> *pr;
778   dispatch_shared_info_template<T> volatile *sh;
779 
780   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
781                    sizeof(dispatch_private_info));
782   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
783                    sizeof(dispatch_shared_info));
784   __kmp_assert_valid_gtid(gtid);
785 
786   if (!TCR_4(__kmp_init_parallel))
787     __kmp_parallel_initialize();
788 
789   __kmp_resume_if_soft_paused();
790 
791 #if INCLUDE_SSC_MARKS
792   SSC_MARK_DISPATCH_INIT();
793 #endif
794 #ifdef KMP_DEBUG
795   typedef typename traits_t<T>::signed_t ST;
796   {
797     char *buff;
798     // create format specifiers before the debug output
799     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
800                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
801                             traits_t<ST>::spec, traits_t<T>::spec,
802                             traits_t<T>::spec, traits_t<ST>::spec);
803     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
804     __kmp_str_free(&buff);
805   }
806 #endif
807   /* setup data */
808   th = __kmp_threads[gtid];
809   team = th->th.th_team;
810   active = !team->t.t_serialized;
811   th->th.th_ident = loc;
812 
813   // Any half-decent optimizer will remove this test when the blocks are empty
814   // since the macros expand to nothing
815   // when statistics are disabled.
816   if (schedule == __kmp_static) {
817     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
818   } else {
819     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
820   }
821 
822 #if KMP_USE_HIER_SCHED
823   // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
824   // environment variable.  Hierarchical scheduling does not work with ordered,
825   // so if ordered is detected, revert to standard threaded scheduling.
826   bool ordered;
827   enum sched_type my_sched = schedule;
828   my_buffer_index = th->th.th_dispatch->th_disp_index;
829   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
830       &th->th.th_dispatch
831            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
832   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
833   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
834     my_sched =
835         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
836   ordered = (kmp_ord_lower & my_sched);
837   if (pr->flags.use_hier) {
838     if (ordered) {
839       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
840                      "Disabling hierarchical scheduling.\n",
841                      gtid));
842       pr->flags.use_hier = FALSE;
843     }
844   }
845   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
846     // Don't use hierarchical for ordered parallel loops and don't
847     // use the runtime hierarchy if one was specified in the program
848     if (!ordered && !pr->flags.use_hier)
849       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
850   }
851 #endif // KMP_USE_HIER_SCHED
852 
853 #if USE_ITT_BUILD
854   kmp_uint64 cur_chunk = chunk;
855   int itt_need_metadata_reporting =
856       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
857       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
858       team->t.t_active_level == 1;
859 #endif
860   if (!active) {
861     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
863   } else {
864     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
865                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
866 
867     my_buffer_index = th->th.th_dispatch->th_disp_index++;
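    // Each thread cycles through __kmp_dispatch_num_buffers private/shared
    // buffer pairs, so several dynamically scheduled loops can be in flight
    // before the __kmp_wait on sh->buffer_index below has to block waiting for
    // an old buffer to be released.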
868 
869     /* What happens when number of threads changes, need to resize buffer? */
870     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
871         &th->th.th_dispatch
872              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
873     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
874         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
875     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
876                   my_buffer_index));
877   }
878 
879   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
880 #if USE_ITT_BUILD
881                                 &cur_chunk,
882 #endif
883                                 chunk, (T)th->th.th_team_nproc,
884                                 (T)th->th.th_info.ds.ds_tid);
885   if (active) {
886     if (pr->flags.ordered == 0) {
887       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
888       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
889     } else {
890       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
891       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
892     }
893   }
894 
895   if (active) {
896     /* The shared buffer's buffer_index should equal my_buffer_index when the
897      * buffer is free for this thread to use */
898 
899     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
900                    "sh->buffer_index:%d\n",
901                    gtid, my_buffer_index, sh->buffer_index));
902     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
903                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
904     // Note: KMP_WAIT() cannot be used here: buffer index and
905     // my_buffer_index are *always* 32-bit integers.
906     KMP_MB(); /* is this necessary? */
907     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
908                    "sh->buffer_index:%d\n",
909                    gtid, my_buffer_index, sh->buffer_index));
910 
911     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
912     th->th.th_dispatch->th_dispatch_sh_current =
913         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
914 #if USE_ITT_BUILD
915     if (pr->flags.ordered) {
916       __kmp_itt_ordered_init(gtid);
917     }
918     // Report loop metadata
919     if (itt_need_metadata_reporting) {
920       // Only report metadata by master of active team at level 1
921       kmp_uint64 schedtype = 0;
922       switch (schedule) {
923       case kmp_sch_static_chunked:
924       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
925         break;
926       case kmp_sch_static_greedy:
927         cur_chunk = pr->u.p.parm1;
928         break;
929       case kmp_sch_dynamic_chunked:
930         schedtype = 1;
931         break;
932       case kmp_sch_guided_iterative_chunked:
933       case kmp_sch_guided_analytical_chunked:
934       case kmp_sch_guided_simd:
935         schedtype = 2;
936         break;
937       default:
938         // Should we put this case under "static"?
939         // case kmp_sch_static_steal:
940         schedtype = 3;
941         break;
942       }
943       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
944     }
945 #if KMP_USE_HIER_SCHED
946     if (pr->flags.use_hier) {
947       pr->u.p.count = 0;
948       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
949     }
950 #endif // KMP_USE_HIER_SCHED
951 #endif /* USE_ITT_BUILD */
952   }
953 
954 #ifdef KMP_DEBUG
955   {
956     char *buff;
957     // create format specifiers before the debug output
958     buff = __kmp_str_format(
959         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
960         "lb:%%%s ub:%%%s"
961         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
962         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
963         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
964         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
965         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
966         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
967     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
968                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
969                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
970                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
971     __kmp_str_free(&buff);
972   }
973 #endif
974 #if (KMP_STATIC_STEAL_ENABLED)
975   // It cannot be guaranteed that, after execution of a loop with some other
976   // schedule kind, all the parm3 variables will contain the same value. Even if
977   // they all did, a bad case could still exist, such as using only 0 and 1
978   // rather than a program-lifetime increment. So a dedicated variable is
979   // required; 'static_steal_counter' is used.
980   if (pr->schedule == kmp_sch_static_steal) {
981     // Other threads will inspect this variable when searching for a victim.
982     // This is a flag showing that other threads may steal from this thread
983     // from now on.
984     volatile T *p = &pr->u.p.static_steal_counter;
985     *p = *p + 1;
986   }
987 #endif // ( KMP_STATIC_STEAL_ENABLED )
988 
989 #if OMPT_SUPPORT && OMPT_OPTIONAL
990   if (ompt_enabled.ompt_callback_work) {
991     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
992     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
993     ompt_callbacks.ompt_callback(ompt_callback_work)(
994         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
995         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
996   }
997 #endif
998   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
999 }
1000 
1001 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1002  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1003  * every chunk of iterations.  If the ordered section(s) were not executed
1004  * for this iteration (or every iteration in this chunk), we need to set the
1005  * ordered iteration counters so that the next thread can proceed. */
1006 template <typename UT>
1007 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1008   typedef typename traits_t<UT>::signed_t ST;
1009   __kmp_assert_valid_gtid(gtid);
1010   kmp_info_t *th = __kmp_threads[gtid];
1011 
1012   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1013   if (!th->th.th_team->t.t_serialized) {
1014 
1015     dispatch_private_info_template<UT> *pr =
1016         reinterpret_cast<dispatch_private_info_template<UT> *>(
1017             th->th.th_dispatch->th_dispatch_pr_current);
1018     dispatch_shared_info_template<UT> volatile *sh =
1019         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1020             th->th.th_dispatch->th_dispatch_sh_current);
1021     KMP_DEBUG_ASSERT(pr);
1022     KMP_DEBUG_ASSERT(sh);
1023     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1024                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1025 
1026     if (pr->ordered_bumped) {
1027       KD_TRACE(
1028           1000,
1029           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1030            gtid));
1031       pr->ordered_bumped = 0;
1032     } else {
1033       UT lower = pr->u.p.ordered_lower;
1034 
1035 #ifdef KMP_DEBUG
1036       {
1037         char *buff;
1038         // create format specifiers before the debug output
1039         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1040                                 "ordered_iteration:%%%s lower:%%%s\n",
1041                                 traits_t<UT>::spec, traits_t<UT>::spec);
1042         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1043         __kmp_str_free(&buff);
1044       }
1045 #endif
1046 
1047       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1048                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1049       KMP_MB(); /* is this necessary? */
1050 #ifdef KMP_DEBUG
1051       {
1052         char *buff;
1053         // create format specifiers before the debug output
1054         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1055                                 "ordered_iteration:%%%s lower:%%%s\n",
1056                                 traits_t<UT>::spec, traits_t<UT>::spec);
1057         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1058         __kmp_str_free(&buff);
1059       }
1060 #endif
1061 
1062       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1063     } // if
1064   } // if
1065   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1066 }
1067 
1068 #ifdef KMP_GOMP_COMPAT
1069 
1070 template <typename UT>
1071 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1072   typedef typename traits_t<UT>::signed_t ST;
1073   __kmp_assert_valid_gtid(gtid);
1074   kmp_info_t *th = __kmp_threads[gtid];
1075 
1076   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1077   if (!th->th.th_team->t.t_serialized) {
1078     //        int cid;
1079     dispatch_private_info_template<UT> *pr =
1080         reinterpret_cast<dispatch_private_info_template<UT> *>(
1081             th->th.th_dispatch->th_dispatch_pr_current);
1082     dispatch_shared_info_template<UT> volatile *sh =
1083         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1084             th->th.th_dispatch->th_dispatch_sh_current);
1085     KMP_DEBUG_ASSERT(pr);
1086     KMP_DEBUG_ASSERT(sh);
1087     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1088                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1089 
1090     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1091     UT lower = pr->u.p.ordered_lower;
1092     UT upper = pr->u.p.ordered_upper;
1093     UT inc = upper - lower + 1;
1094 
1095     if (pr->ordered_bumped == inc) {
1096       KD_TRACE(
1097           1000,
1098           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1099            gtid));
1100       pr->ordered_bumped = 0;
1101     } else {
1102       inc -= pr->ordered_bumped;
1103 
1104 #ifdef KMP_DEBUG
1105       {
1106         char *buff;
1107         // create format specifiers before the debug output
1108         buff = __kmp_str_format(
1109             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1110             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1111             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1112         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1113         __kmp_str_free(&buff);
1114       }
1115 #endif
1116 
1117       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1118                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1119 
1120       KMP_MB(); /* is this necessary? */
1121       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1122                       "ordered_bumped to zero\n",
1123                       gtid));
1124       pr->ordered_bumped = 0;
1125       // TODO: check whether inc should be unsigned or signed
1126 #ifdef KMP_DEBUG
1127       {
1128         char *buff;
1129         // create format specifiers before the debug output
1130         buff = __kmp_str_format(
1131             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1132             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1133             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1134             traits_t<UT>::spec);
1135         KD_TRACE(1000,
1136                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1137         __kmp_str_free(&buff);
1138       }
1139 #endif
1140 
1141       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1142     }
1143     //        }
1144   }
1145   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1146 }
1147 
1148 #endif /* KMP_GOMP_COMPAT */
1149 
1150 template <typename T>
1151 int __kmp_dispatch_next_algorithm(int gtid,
1152                                   dispatch_private_info_template<T> *pr,
1153                                   dispatch_shared_info_template<T> volatile *sh,
1154                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1155                                   typename traits_t<T>::signed_t *p_st, T nproc,
1156                                   T tid) {
1157   typedef typename traits_t<T>::unsigned_t UT;
1158   typedef typename traits_t<T>::signed_t ST;
1159   typedef typename traits_t<T>::floating_t DBL;
1160   int status = 0;
1161   bool last = false;
1162   T start;
1163   ST incr;
1164   UT limit, trip, init;
1165   kmp_info_t *th = __kmp_threads[gtid];
1166   kmp_team_t *team = th->th.th_team;
1167 
1168   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1169                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1170   KMP_DEBUG_ASSERT(pr);
1171   KMP_DEBUG_ASSERT(sh);
1172   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1173 #ifdef KMP_DEBUG
1174   {
1175     char *buff;
1176     // create format specifiers before the debug output
1177     buff =
1178         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1179                          "sh:%%p nproc:%%%s tid:%%%s\n",
1180                          traits_t<T>::spec, traits_t<T>::spec);
1181     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1182     __kmp_str_free(&buff);
1183   }
1184 #endif
1185 
1186   // zero trip count
1187   if (pr->u.p.tc == 0) {
1188     KD_TRACE(10,
1189              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1190               "zero status:%d\n",
1191               gtid, status));
1192     return 0;
1193   }
1194 
1195   switch (pr->schedule) {
1196 #if (KMP_STATIC_STEAL_ENABLED)
1197   case kmp_sch_static_steal: {
1198     T chunk = pr->u.p.parm1;
1199 
1200     KD_TRACE(100,
1201              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1202               gtid));
1203 
1204     trip = pr->u.p.tc - 1;
1205 
1206     if (traits_t<T>::type_size > 4) {
1207       // use lock for 8-byte and CAS for 4-byte induction
1208       // variable. TODO (optional): check and use 16-byte CAS
1209       kmp_lock_t *lck = pr->u.p.th_steal_lock;
1210       KMP_DEBUG_ASSERT(lck != NULL);
1211       if (pr->u.p.count < (UT)pr->u.p.ub) {
1212         __kmp_acquire_lock(lck, gtid);
1213         // try to get own chunk of iterations
1214         init = (pr->u.p.count)++;
1215         status = (init < (UT)pr->u.p.ub);
1216         __kmp_release_lock(lck, gtid);
1217       } else {
1218         status = 0; // no own chunks
1219       }
1220       if (!status) { // try to steal
1221         kmp_info_t **other_threads = team->t.t_threads;
1222         T while_limit = pr->u.p.parm3;
1223         T while_index = 0;
1224         T id = pr->u.p.static_steal_counter; // loop id
1225         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1226                   __kmp_dispatch_num_buffers; // current loop index
1227         // note: victim thread can potentially execute another loop
1228         // TODO: algorithm of searching for a victim
1229         // should be cleaned up and measured
1230         while ((!status) && (while_limit != ++while_index)) {
1231           dispatch_private_info_template<T> *victim;
1232           T remaining;
1233           T victimIdx = pr->u.p.parm4;
1234           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1235           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1236               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1237           KMP_DEBUG_ASSERT(victim);
1238           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1239                  oldVictimIdx != victimIdx) {
1240             victimIdx = (victimIdx + 1) % nproc;
1241             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1242                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1243             KMP_DEBUG_ASSERT(victim);
1244           }
1245           if (victim == pr || id != victim->u.p.static_steal_counter) {
1246             continue; // try once more (nproc attempts in total)
1247             // no victim is ready yet to participate in stealing
1248             // because no victim passed kmp_init_dispatch yet
1249           }
1250           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1251             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1252             continue; // not enough chunks to steal, goto next victim
1253           }
1254 
1255           lck = victim->u.p.th_steal_lock;
1256           KMP_ASSERT(lck != NULL);
1257           __kmp_acquire_lock(lck, gtid);
1258           limit = victim->u.p.ub; // keep initial ub
1259           if (victim->u.p.count >= limit ||
1260               (remaining = limit - victim->u.p.count) < 2) {
1261             __kmp_release_lock(lck, gtid);
1262             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1263             continue; // not enough chunks to steal
1264           }
1265           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1266           // by 1
1267           if (remaining > 3) {
1268             // steal 1/4 of remaining
1269             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1270             init = (victim->u.p.ub -= (remaining >> 2));
1271           } else {
1272             // steal 1 chunk of 2 or 3 remaining
1273             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1274             init = (victim->u.p.ub -= 1);
1275           }
1276           __kmp_release_lock(lck, gtid);
1277 
1278           KMP_DEBUG_ASSERT(init + 1 <= limit);
1279           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1280           status = 1;
1281           while_index = 0;
1282           // now update own count and ub with the stolen range minus the init chunk
1283           __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1284           pr->u.p.count = init + 1;
1285           pr->u.p.ub = limit;
1286           __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
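          // Illustrative example: if the victim had remaining = 8 undone
          // chunks, its ub was lowered by 8 >> 2 = 2; the thief dispatches
          // chunk 'init' now and keeps the rest of the stolen range by setting
          // its own count = init + 1 and ub = limit.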
1287         } // while (search for victim)
1288       } // if (try to find victim and steal)
1289     } else {
1290       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1291       typedef union {
1292         struct {
1293           UT count;
1294           T ub;
1295         } p;
1296         kmp_int64 b;
1297       } union_i4;
1298       // All operations on 'count' or 'ub' must be combined atomically
1299       // together.
1300       {
1301         union_i4 vold, vnew;
1302         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1303         vnew = vold;
1304         vnew.p.count++;
1305         while (!KMP_COMPARE_AND_STORE_ACQ64(
1306             (volatile kmp_int64 *)&pr->u.p.count,
1307             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1308             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1309           KMP_CPU_PAUSE();
1310           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1311           vnew = vold;
1312           vnew.p.count++;
1313         }
1314         vnew = vold;
1315         init = vnew.p.count;
1316         status = (init < (UT)vnew.p.ub);
1317       }
1318 
1319       if (!status) {
1320         kmp_info_t **other_threads = team->t.t_threads;
1321         T while_limit = pr->u.p.parm3;
1322         T while_index = 0;
1323         T id = pr->u.p.static_steal_counter; // loop id
1324         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1325                   __kmp_dispatch_num_buffers; // current loop index
1326         // note: victim thread can potentially execute another loop
1327         // TODO: algorithm of searching for a victim
1328         // should be cleaned up and measured
1329         while ((!status) && (while_limit != ++while_index)) {
1330           dispatch_private_info_template<T> *victim;
1331           union_i4 vold, vnew;
1332           T remaining;
1333           T victimIdx = pr->u.p.parm4;
1334           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1335           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1336               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1337           KMP_DEBUG_ASSERT(victim);
1338           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1339                  oldVictimIdx != victimIdx) {
1340             victimIdx = (victimIdx + 1) % nproc;
1341             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1342                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1343             KMP_DEBUG_ASSERT(victim);
1344           }
1345           if (victim == pr || id != victim->u.p.static_steal_counter) {
1346             continue; // try once more (nproc attempts in total)
1347             // no victim is ready yet to participate in stealing
1348             // because no victim passed kmp_init_dispatch yet
1349           }
1350           pr->u.p.parm4 = victimIdx; // new victim found
1351           while (1) { // CAS loop if victim has enough chunks to steal
1352             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1353             vnew = vold;
1354 
1355             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1356             if (vnew.p.count >= (UT)vnew.p.ub ||
1357                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1358               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1359               break; // not enough chunks to steal, goto next victim
1360             }
1361             if (remaining > 3) {
1362               // try to steal 1/4 of remaining
1363               vnew.p.ub -= remaining >> 2;
1364             } else {
1365               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1366             }
1367             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1368             // TODO: Should this be acquire or release?
1369             if (KMP_COMPARE_AND_STORE_ACQ64(
1370                     (volatile kmp_int64 *)&victim->u.p.count,
1371                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1372                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1373               // stealing succeeded
1374               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1375                                         vold.p.ub - vnew.p.ub);
1376               status = 1;
1377               while_index = 0;
1378               // now update own count and ub
1379               init = vnew.p.ub;
1380               vold.p.count = init + 1;
1381 #if KMP_ARCH_X86
1382               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1383 #else
1384               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1385 #endif
1386               break;
1387             } // if (check CAS result)
1388             KMP_CPU_PAUSE(); // CAS failed, retry the steal attempt
1389           } // while (try to steal from particular victim)
1390         } // while (search for victim)
1391       } // if (try to find victim and steal)
1392     } // if (4-byte induction variable)
1393     if (!status) {
1394       *p_lb = 0;
1395       *p_ub = 0;
1396       if (p_st != NULL)
1397         *p_st = 0;
1398     } else {
1399       start = pr->u.p.parm2;
1400       init *= chunk;
1401       limit = chunk + init - 1;
1402       incr = pr->u.p.st;
1403       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1404 
1405       KMP_DEBUG_ASSERT(init <= trip);
1406       if ((last = (limit >= trip)) != 0)
1407         limit = trip;
1408       if (p_st != NULL)
1409         *p_st = incr;
1410 
1411       if (incr == 1) {
1412         *p_lb = start + init;
1413         *p_ub = start + limit;
1414       } else {
1415         *p_lb = start + init * incr;
1416         *p_ub = start + limit * incr;
1417       }
1418 
1419       if (pr->flags.ordered) {
1420         pr->u.p.ordered_lower = init;
1421         pr->u.p.ordered_upper = limit;
1422       } // if
1423     } // if
1424     break;
1425   } // case
1426 #endif // ( KMP_STATIC_STEAL_ENABLED )
1427   case kmp_sch_static_balanced: {
1428     KD_TRACE(
1429         10,
1430         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1431          gtid));
1432     /* check if thread has any iteration to do */
1433     if ((status = !pr->u.p.count) != 0) {
1434       pr->u.p.count = 1;
1435       *p_lb = pr->u.p.lb;
1436       *p_ub = pr->u.p.ub;
1437       last = (pr->u.p.parm1 != 0);
1438       if (p_st != NULL)
1439         *p_st = pr->u.p.st;
1440     } else { /* no iterations to do */
1441       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1442     }
1443   } // case
1444   break;
1445   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1446                                  merged here */
1447   case kmp_sch_static_chunked: {
1448     T parm1;
1449 
1450     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1451                    "kmp_sch_static_[affinity|chunked] case\n",
1452                    gtid));
1453     parm1 = pr->u.p.parm1;
1454 
1455     trip = pr->u.p.tc - 1;
1456     init = parm1 * (pr->u.p.count + tid);
1457 
1458     if ((status = (init <= trip)) != 0) {
1459       start = pr->u.p.lb;
1460       incr = pr->u.p.st;
1461       limit = parm1 + init - 1;
1462 
1463       if ((last = (limit >= trip)) != 0)
1464         limit = trip;
1465 
1466       if (p_st != NULL)
1467         *p_st = incr;
1468 
1469       pr->u.p.count += nproc;
1470 
1471       if (incr == 1) {
1472         *p_lb = start + init;
1473         *p_ub = start + limit;
1474       } else {
1475         *p_lb = start + init * incr;
1476         *p_ub = start + limit * incr;
1477       }
1478 
1479       if (pr->flags.ordered) {
1480         pr->u.p.ordered_lower = init;
1481         pr->u.p.ordered_upper = limit;
1482       } // if
1483     } // if
1484   } // case
1485   break;
1486 
1487   case kmp_sch_dynamic_chunked: {
1488     T chunk = pr->u.p.parm1;
1489 
1490     KD_TRACE(
1491         100,
1492         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1493          gtid));
1494 
1495     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1496     trip = pr->u.p.tc - 1;
1497 
1498     if ((status = (init <= trip)) == 0) {
1499       *p_lb = 0;
1500       *p_ub = 0;
1501       if (p_st != NULL)
1502         *p_st = 0;
1503     } else {
1504       start = pr->u.p.lb;
1505       limit = chunk + init - 1;
1506       incr = pr->u.p.st;
1507 
1508       if ((last = (limit >= trip)) != 0)
1509         limit = trip;
1510 
1511       if (p_st != NULL)
1512         *p_st = incr;
1513 
1514       if (incr == 1) {
1515         *p_lb = start + init;
1516         *p_ub = start + limit;
1517       } else {
1518         *p_lb = start + init * incr;
1519         *p_ub = start + limit * incr;
1520       }
1521 
1522       if (pr->flags.ordered) {
1523         pr->u.p.ordered_lower = init;
1524         pr->u.p.ordered_upper = limit;
1525       } // if
1526     } // if
1527   } // case
1528   break;
1529 
1530   case kmp_sch_guided_iterative_chunked: {
1531     T chunkspec = pr->u.p.parm1;
1532     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1533                    "iterative case\n",
1534                    gtid));
1535     trip = pr->u.p.tc;
1536     // Start atomic part of calculations
1537     while (1) {
1538       ST remaining; // signed, because can be < 0
1539       init = sh->u.s.iteration; // shared value
1540       remaining = trip - init;
1541       if (remaining <= 0) { // AC: need to compare with 0 first
1542         // nothing to do, don't try atomic op
1543         status = 0;
1544         break;
1545       }
1546       if ((T)remaining <
1547           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1548         // use dynamic-style schedule
1549         // atomically increment iterations, get old value
1550         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1551                                  (ST)chunkspec);
1552         remaining = trip - init;
1553         if (remaining <= 0) {
1554           status = 0; // all iterations were taken by other threads
1555         } else {
1556           // got some iterations to work on
1557           status = 1;
1558           if ((T)remaining > chunkspec) {
1559             limit = init + chunkspec - 1;
1560           } else {
1561             last = true; // the last chunk
1562             limit = init + remaining - 1;
1563           } // if
1564         } // if
1565         break;
1566       } // if
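      // Guided step: claim roughly remaining / (K * nproc) iterations; parm3
      // caches 1/(K*nproc) as a double (see the "divide by K*nproc" note
      // below). For example, with nproc == 4, K == 2 and remaining == 100,
      // the thread attempts a chunk of about 12 iterations.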
1567       limit = init + (UT)((double)remaining *
1568                           *(double *)&pr->u.p.parm3); // divide by K*nproc
1569       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1570                                (ST)init, (ST)limit)) {
1571         // CAS was successful, chunk obtained
1572         status = 1;
1573         --limit;
1574         break;
1575       } // if
1576     } // while
1577     if (status != 0) {
1578       start = pr->u.p.lb;
1579       incr = pr->u.p.st;
1580       if (p_st != NULL)
1581         *p_st = incr;
1582       *p_lb = start + init * incr;
1583       *p_ub = start + limit * incr;
1584       if (pr->flags.ordered) {
1585         pr->u.p.ordered_lower = init;
1586         pr->u.p.ordered_upper = limit;
1587       } // if
1588     } else {
1589       *p_lb = 0;
1590       *p_ub = 0;
1591       if (p_st != NULL)
1592         *p_st = 0;
1593     } // if
1594   } // case
1595   break;
1596 
1597   case kmp_sch_guided_simd: {
1598     // same as the iterative case, but the current chunk is adjusted to be a
1599     // multiple of the given chunk
1600     T chunk = pr->u.p.parm1;
1601     KD_TRACE(100,
1602              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1603               gtid));
1604     trip = pr->u.p.tc;
1605     // Start atomic part of calculations
1606     while (1) {
1607       ST remaining; // signed, because can be < 0
1608       init = sh->u.s.iteration; // shared value
1609       remaining = trip - init;
1610       if (remaining <= 0) { // AC: need to compare with 0 first
1611         status = 0; // nothing to do, don't try atomic op
1612         break;
1613       }
1614       KMP_DEBUG_ASSERT(init % chunk == 0);
1615       // compare with K*nproc*(chunk+1), K=2 by default
1616       if ((T)remaining < pr->u.p.parm2) {
1617         // use dynamic-style schedule
1618         // atomically increment iterations, get old value
1619         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1620                                  (ST)chunk);
1621         remaining = trip - init;
1622         if (remaining <= 0) {
1623           status = 0; // all iterations were taken by other threads
1624         } else {
1625           // got some iterations to work on
1626           status = 1;
1627           if ((T)remaining > chunk) {
1628             limit = init + chunk - 1;
1629           } else {
1630             last = true; // the last chunk
1631             limit = init + remaining - 1;
1632           } // if
1633         } // if
1634         break;
1635       } // if
1636       // divide by K*nproc
1637       UT span;
1638       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1639                          &span);
1640       UT rem = span % chunk;
1641       if (rem) // adjust so that span%chunk == 0
1642         span += chunk - rem;
1643       limit = init + span;
1644       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1645                                (ST)init, (ST)limit)) {
1646         // CAS was successful, chunk obtained
1647         status = 1;
1648         --limit;
1649         break;
1650       } // if
1651     } // while
1652     if (status != 0) {
1653       start = pr->u.p.lb;
1654       incr = pr->u.p.st;
1655       if (p_st != NULL)
1656         *p_st = incr;
1657       *p_lb = start + init * incr;
1658       *p_ub = start + limit * incr;
1659       if (pr->flags.ordered) {
1660         pr->u.p.ordered_lower = init;
1661         pr->u.p.ordered_upper = limit;
1662       } // if
1663     } else {
1664       *p_lb = 0;
1665       *p_ub = 0;
1666       if (p_st != NULL)
1667         *p_st = 0;
1668     } // if
1669   } // case
1670   break;
1671 
1672   case kmp_sch_guided_analytical_chunked: {
1673     T chunkspec = pr->u.p.parm1;
1674     UT chunkIdx;
1675 #if KMP_USE_X87CONTROL
1676     /* for storing original FPCW value for Windows* OS on
1677        IA-32 architecture 8-byte version */
1678     unsigned int oldFpcw;
1679     unsigned int fpcwSet = 0;
1680 #endif
1681     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1682                    "kmp_sch_guided_analytical_chunked case\n",
1683                    gtid));
1684 
1685     trip = pr->u.p.tc;
1686 
1687     KMP_DEBUG_ASSERT(nproc > 1);
1688     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1689 
1690     while (1) { /* this while loop is a safeguard against unexpected zero
1691                    chunk sizes */
1692       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1693       if (chunkIdx >= (UT)pr->u.p.parm2) {
1694         --trip;
1695         /* use dynamic-style scheduling */
1696         init = chunkIdx * chunkspec + pr->u.p.count;
1697         /* need to verify init > 0 in case of overflow in the above
1698          * calculation */
1699         if ((status = (init > 0 && init <= trip)) != 0) {
1700           limit = init + chunkspec - 1;
1701 
1702           if ((last = (limit >= trip)) != 0)
1703             limit = trip;
1704         }
1705         break;
1706       } else {
1707 /* use exponential-style scheduling */
1708 /* The following check works around the lack of long double precision on
1709    Windows* OS.
1710    It addresses the possible effect that init != 0 for chunkIdx == 0.
1711  */
1712 #if KMP_USE_X87CONTROL
1713         /* If we haven't already done so, save original
1714            FPCW and set precision to 64-bit, as Windows* OS
1715            on IA-32 architecture defaults to 53-bit */
1716         if (!fpcwSet) {
1717           oldFpcw = _control87(0, 0);
1718           _control87(_PC_64, _MCW_PC);
1719           fpcwSet = 0x30000;
1720         }
1721 #endif
1722         if (chunkIdx) {
1723           init = __kmp_dispatch_guided_remaining<T>(
1724               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1725           KMP_DEBUG_ASSERT(init);
1726           init = trip - init;
1727         } else
1728           init = 0;
1729         limit = trip - __kmp_dispatch_guided_remaining<T>(
1730                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1731         KMP_ASSERT(init <= limit);
1732         if (init < limit) {
1733           KMP_DEBUG_ASSERT(limit <= trip);
1734           --limit;
1735           status = 1;
1736           break;
1737         } // if
1738       } // if
1739     } // while (1)
1740 #if KMP_USE_X87CONTROL
1741     /* restore FPCW if necessary
1742        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1743     */
1744     if (fpcwSet && (oldFpcw & fpcwSet))
1745       _control87(oldFpcw, _MCW_PC);
1746 #endif
1747     if (status != 0) {
1748       start = pr->u.p.lb;
1749       incr = pr->u.p.st;
1750       if (p_st != NULL)
1751         *p_st = incr;
1752       *p_lb = start + init * incr;
1753       *p_ub = start + limit * incr;
1754       if (pr->flags.ordered) {
1755         pr->u.p.ordered_lower = init;
1756         pr->u.p.ordered_upper = limit;
1757       }
1758     } else {
1759       *p_lb = 0;
1760       *p_ub = 0;
1761       if (p_st != NULL)
1762         *p_st = 0;
1763     }
1764   } // case
1765   break;
1766 
1767   case kmp_sch_trapezoidal: {
1768     UT index;
1769     T parm2 = pr->u.p.parm2;
1770     T parm3 = pr->u.p.parm3;
1771     T parm4 = pr->u.p.parm4;
1772     KD_TRACE(100,
1773              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1774               gtid));
1775 
1776     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1777 
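    // Explanatory note: the trapezoidal schedule hands out chunks whose sizes
    // form a decreasing arithmetic sequence, with parm2 the first chunk size
    // and parm4 the per-chunk decrement (as set up in __kmp_dispatch_init for
    // this schedule). Chunk k then has size parm2 - k * parm4, so the start of
    // chunk `index` is the arithmetic-series sum
    //   index * (2 * parm2 - (index - 1) * parm4) / 2,
    // which is exactly what the next statement computes.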
1778     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1779     trip = pr->u.p.tc - 1;
1780 
1781     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1782       *p_lb = 0;
1783       *p_ub = 0;
1784       if (p_st != NULL)
1785         *p_st = 0;
1786     } else {
1787       start = pr->u.p.lb;
1788       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1789       incr = pr->u.p.st;
1790 
1791       if ((last = (limit >= trip)) != 0)
1792         limit = trip;
1793 
1794       if (p_st != NULL)
1795         *p_st = incr;
1796 
1797       if (incr == 1) {
1798         *p_lb = start + init;
1799         *p_ub = start + limit;
1800       } else {
1801         *p_lb = start + init * incr;
1802         *p_ub = start + limit * incr;
1803       }
1804 
1805       if (pr->flags.ordered) {
1806         pr->u.p.ordered_lower = init;
1807         pr->u.p.ordered_upper = limit;
1808       } // if
1809     } // if
1810   } // case
1811   break;
1812   default: {
1813     status = 0; // to avoid complaints on uninitialized variable use
1814     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1815                 KMP_HNT(GetNewerLibrary), // Hint
1816                 __kmp_msg_null // Variadic argument list terminator
1817                 );
1818   } break;
1819   } // switch
1820   if (p_last)
1821     *p_last = last;
1822 #ifdef KMP_DEBUG
1823   if (pr->flags.ordered) {
1824     char *buff;
1825     // create format specifiers before the debug output
1826     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1827                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1828                             traits_t<UT>::spec, traits_t<UT>::spec);
1829     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1830     __kmp_str_free(&buff);
1831   }
1832   {
1833     char *buff;
1834     // create format specifiers before the debug output
1835     buff = __kmp_str_format(
1836         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1837         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1838         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1839     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1840     __kmp_str_free(&buff);
1841   }
1842 #endif
1843   return status;
1844 }
1845 
1846 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1847    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1848    is not called. */
1849 #if OMPT_SUPPORT && OMPT_OPTIONAL
1850 #define OMPT_LOOP_END                                                          \
1851   if (status == 0) {                                                           \
1852     if (ompt_enabled.ompt_callback_work) {                                     \
1853       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1854       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1855       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1856           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1857           &(task_info->task_data), 0, codeptr);                                \
1858     }                                                                          \
1859   }
1860 // TODO: implement count
1861 #else
1862 #define OMPT_LOOP_END // no-op
1863 #endif
1864 
1865 #if KMP_STATS_ENABLED
1866 #define KMP_STATS_LOOP_END                                                     \
1867   {                                                                            \
1868     kmp_int64 u, l, t, i;                                                      \
1869     l = (kmp_int64)(*p_lb);                                                    \
1870     u = (kmp_int64)(*p_ub);                                                    \
1871     i = (kmp_int64)(pr->u.p.st);                                               \
1872     if (status == 0) {                                                         \
1873       t = 0;                                                                   \
1874       KMP_POP_PARTITIONED_TIMER();                                             \
1875     } else if (i == 1) {                                                       \
1876       if (u >= l)                                                              \
1877         t = u - l + 1;                                                         \
1878       else                                                                     \
1879         t = 0;                                                                 \
1880     } else if (i < 0) {                                                        \
1881       if (l >= u)                                                              \
1882         t = (l - u) / (-i) + 1;                                                \
1883       else                                                                     \
1884         t = 0;                                                                 \
1885     } else {                                                                   \
1886       if (u >= l)                                                              \
1887         t = (u - l) / i + 1;                                                   \
1888       else                                                                     \
1889         t = 0;                                                                 \
1890     }                                                                          \
1891     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1892   }
1893 #else
1894 #define KMP_STATS_LOOP_END /* Nothing */
1895 #endif
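// Quick check of the trip-count arithmetic above: for a chunk with l == 0,
// u == 9 and stride i == 3, t = (u - l) / i + 1 == 4, matching the iterations
// 0, 3, 6, 9; a negative stride uses the mirrored (l - u) / (-i) + 1 form.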
1896 
1897 template <typename T>
1898 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1899                                T *p_lb, T *p_ub,
1900                                typename traits_t<T>::signed_t *p_st
1901 #if OMPT_SUPPORT && OMPT_OPTIONAL
1902                                ,
1903                                void *codeptr
1904 #endif
1905                                ) {
1906 
1907   typedef typename traits_t<T>::unsigned_t UT;
1908   typedef typename traits_t<T>::signed_t ST;
1909   // This is potentially slightly misleading: schedule(runtime) will appear here
1910   // even if the actual runtime schedule is static. (This points out a
1911   // disadvantage of schedule(runtime): even when static scheduling is used, it
1912   // costs more than a compile-time choice to use static scheduling would.)
1913   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1914 
1915   int status;
1916   dispatch_private_info_template<T> *pr;
1917   __kmp_assert_valid_gtid(gtid);
1918   kmp_info_t *th = __kmp_threads[gtid];
1919   kmp_team_t *team = th->th.th_team;
1920 
1921   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1922   KD_TRACE(
1923       1000,
1924       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1925        gtid, p_lb, p_ub, p_st, p_last));
1926 
1927   if (team->t.t_serialized) {
1928     /* NOTE: serialize this dispatch because we are not at the active level */
1929     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1930         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1931     KMP_DEBUG_ASSERT(pr);
1932 
1933     if ((status = (pr->u.p.tc != 0)) == 0) {
1934       *p_lb = 0;
1935       *p_ub = 0;
1936       //            if ( p_last != NULL )
1937       //                *p_last = 0;
1938       if (p_st != NULL)
1939         *p_st = 0;
1940       if (__kmp_env_consistency_check) {
1941         if (pr->pushed_ws != ct_none) {
1942           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1943         }
1944       }
1945     } else if (pr->flags.nomerge) {
1946       kmp_int32 last;
1947       T start;
1948       UT limit, trip, init;
1949       ST incr;
1950       T chunk = pr->u.p.parm1;
1951 
1952       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1953                      gtid));
1954 
1955       init = chunk * pr->u.p.count++;
1956       trip = pr->u.p.tc - 1;
1957 
1958       if ((status = (init <= trip)) == 0) {
1959         *p_lb = 0;
1960         *p_ub = 0;
1961         //                if ( p_last != NULL )
1962         //                    *p_last = 0;
1963         if (p_st != NULL)
1964           *p_st = 0;
1965         if (__kmp_env_consistency_check) {
1966           if (pr->pushed_ws != ct_none) {
1967             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1968           }
1969         }
1970       } else {
1971         start = pr->u.p.lb;
1972         limit = chunk + init - 1;
1973         incr = pr->u.p.st;
1974 
1975         if ((last = (limit >= trip)) != 0) {
1976           limit = trip;
1977 #if KMP_OS_WINDOWS
1978           pr->u.p.last_upper = pr->u.p.ub;
1979 #endif /* KMP_OS_WINDOWS */
1980         }
1981         if (p_last != NULL)
1982           *p_last = last;
1983         if (p_st != NULL)
1984           *p_st = incr;
1985         if (incr == 1) {
1986           *p_lb = start + init;
1987           *p_ub = start + limit;
1988         } else {
1989           *p_lb = start + init * incr;
1990           *p_ub = start + limit * incr;
1991         }
1992 
1993         if (pr->flags.ordered) {
1994           pr->u.p.ordered_lower = init;
1995           pr->u.p.ordered_upper = limit;
1996 #ifdef KMP_DEBUG
1997           {
1998             char *buff;
1999             // create format specifiers before the debug output
2000             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2001                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2002                                     traits_t<UT>::spec, traits_t<UT>::spec);
2003             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2004                             pr->u.p.ordered_upper));
2005             __kmp_str_free(&buff);
2006           }
2007 #endif
2008         } // if
2009       } // if
2010     } else {
2011       pr->u.p.tc = 0;
2012       *p_lb = pr->u.p.lb;
2013       *p_ub = pr->u.p.ub;
2014 #if KMP_OS_WINDOWS
2015       pr->u.p.last_upper = *p_ub;
2016 #endif /* KMP_OS_WINDOWS */
2017       if (p_last != NULL)
2018         *p_last = TRUE;
2019       if (p_st != NULL)
2020         *p_st = pr->u.p.st;
2021     } // if
2022 #ifdef KMP_DEBUG
2023     {
2024       char *buff;
2025       // create format specifiers before the debug output
2026       buff = __kmp_str_format(
2027           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2028           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2029           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2030       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2031                     (p_last ? *p_last : 0), status));
2032       __kmp_str_free(&buff);
2033     }
2034 #endif
2035 #if INCLUDE_SSC_MARKS
2036     SSC_MARK_DISPATCH_NEXT();
2037 #endif
2038     OMPT_LOOP_END;
2039     KMP_STATS_LOOP_END;
2040     return status;
2041   } else {
2042     kmp_int32 last = 0;
2043     dispatch_shared_info_template<T> volatile *sh;
2044 
2045     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2046                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2047 
2048     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2049         th->th.th_dispatch->th_dispatch_pr_current);
2050     KMP_DEBUG_ASSERT(pr);
2051     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2052         th->th.th_dispatch->th_dispatch_sh_current);
2053     KMP_DEBUG_ASSERT(sh);
2054 
2055 #if KMP_USE_HIER_SCHED
2056     if (pr->flags.use_hier)
2057       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2058     else
2059 #endif // KMP_USE_HIER_SCHED
2060       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2061                                                 p_st, th->th.th_team_nproc,
2062                                                 th->th.th_info.ds.ds_tid);
2063     // status == 0: no more iterations to execute
2064     if (status == 0) {
2065       UT num_done;
2066 
2067       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2068 #ifdef KMP_DEBUG
2069       {
2070         char *buff;
2071         // create format specifiers before the debug output
2072         buff = __kmp_str_format(
2073             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2074             traits_t<UT>::spec);
2075         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2076         __kmp_str_free(&buff);
2077       }
2078 #endif
2079 
2080 #if KMP_USE_HIER_SCHED
2081       pr->flags.use_hier = FALSE;
2082 #endif
2083       if ((ST)num_done == th->th.th_team_nproc - 1) {
2084 #if (KMP_STATIC_STEAL_ENABLED)
2085         if (pr->schedule == kmp_sch_static_steal &&
2086             traits_t<T>::type_size > 4) {
2087           int i;
2088           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2089                     __kmp_dispatch_num_buffers; // current loop index
2090           kmp_info_t **other_threads = team->t.t_threads;
2091           // loop complete, safe to destroy locks used for stealing
2092           for (i = 0; i < th->th.th_team_nproc; ++i) {
2093             dispatch_private_info_template<T> *buf =
2094                 reinterpret_cast<dispatch_private_info_template<T> *>(
2095                     &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2096             kmp_lock_t *lck = buf->u.p.th_steal_lock;
2097             KMP_ASSERT(lck != NULL);
2098             __kmp_destroy_lock(lck);
2099             __kmp_free(lck);
2100             buf->u.p.th_steal_lock = NULL;
2101           }
2102         }
2103 #endif
2104         /* NOTE: release this buffer to be reused */
2105 
2106         KMP_MB(); /* Flush all pending memory write invalidates.  */
2107 
2108         sh->u.s.num_done = 0;
2109         sh->u.s.iteration = 0;
2110 
2111         /* TODO replace with general release procedure? */
2112         if (pr->flags.ordered) {
2113           sh->u.s.ordered_iteration = 0;
2114         }
2115 
2116         KMP_MB(); /* Flush all pending memory write invalidates.  */
2117 
2118         sh->buffer_index += __kmp_dispatch_num_buffers;
2119         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2120                        gtid, sh->buffer_index));
2121 
2122         KMP_MB(); /* Flush all pending memory write invalidates.  */
2123 
2124       } // if
2125       if (__kmp_env_consistency_check) {
2126         if (pr->pushed_ws != ct_none) {
2127           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2128         }
2129       }
2130 
2131       th->th.th_dispatch->th_deo_fcn = NULL;
2132       th->th.th_dispatch->th_dxo_fcn = NULL;
2133       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2134       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2135     } // if (status == 0)
2136 #if KMP_OS_WINDOWS
2137     else if (last) {
2138       pr->u.p.last_upper = pr->u.p.ub;
2139     }
2140 #endif /* KMP_OS_WINDOWS */
2141     if (p_last != NULL && status != 0)
2142       *p_last = last;
2143   } // if
2144 
2145 #ifdef KMP_DEBUG
2146   {
2147     char *buff;
2148     // create format specifiers before the debug output
2149     buff = __kmp_str_format(
2150         "__kmp_dispatch_next: T#%%d normal case: "
2151         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2152         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2153     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2154                   (p_last ? *p_last : 0), status));
2155     __kmp_str_free(&buff);
2156   }
2157 #endif
2158 #if INCLUDE_SSC_MARKS
2159   SSC_MARK_DISPATCH_NEXT();
2160 #endif
2161   OMPT_LOOP_END;
2162   KMP_STATS_LOOP_END;
2163   return status;
2164 }
2165 
2166 template <typename T>
2167 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2168                                   kmp_int32 *plastiter, T *plower, T *pupper,
2169                                   typename traits_t<T>::signed_t incr) {
2170   typedef typename traits_t<T>::unsigned_t UT;
2171   kmp_uint32 team_id;
2172   kmp_uint32 nteams;
2173   UT trip_count;
2174   kmp_team_t *team;
2175   kmp_info_t *th;
2176 
2177   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2178   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2179 #ifdef KMP_DEBUG
2180   typedef typename traits_t<T>::signed_t ST;
2181   {
2182     char *buff;
2183     // create format specifiers before the debug output
2184     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2185                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2186                             traits_t<T>::spec, traits_t<T>::spec,
2187                             traits_t<ST>::spec, traits_t<T>::spec);
2188     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2189     __kmp_str_free(&buff);
2190   }
2191 #endif
2192 
2193   if (__kmp_env_consistency_check) {
2194     if (incr == 0) {
2195       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2196                             loc);
2197     }
2198     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2199       // The loop is illegal.
2200       // Some zero-trip loops maintained by compiler, e.g.:
2201       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2202       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2203       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2204       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2205       // Compiler does not check the following illegal loops:
2206       //   for(i=0;i<10;i+=incr) // where incr<0
2207       //   for(i=10;i>0;i-=incr) // where incr<0
2208       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2209     }
2210   }
2211   __kmp_assert_valid_gtid(gtid);
2212   th = __kmp_threads[gtid];
2213   team = th->th.th_team;
2214   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2215   nteams = th->th.th_teams_size.nteams;
2216   team_id = team->t.t_master_tid;
2217   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2218 
2219   // compute global trip count
2220   if (incr == 1) {
2221     trip_count = *pupper - *plower + 1;
2222   } else if (incr == -1) {
2223     trip_count = *plower - *pupper + 1;
2224   } else if (incr > 0) {
2225     // upper-lower can exceed the limit of signed type
2226     trip_count = (UT)(*pupper - *plower) / incr + 1;
2227   } else {
2228     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2229   }
2230 
2231   if (trip_count <= nteams) {
2232     KMP_DEBUG_ASSERT(
2233         __kmp_static == kmp_sch_static_greedy ||
2234         __kmp_static ==
2235             kmp_sch_static_balanced); // Unknown static scheduling type.
2236     // only some teams get single iteration, others get nothing
2237     if (team_id < trip_count) {
2238       *pupper = *plower = *plower + team_id * incr;
2239     } else {
2240       *plower = *pupper + incr; // zero-trip loop
2241     }
2242     if (plastiter != NULL)
2243       *plastiter = (team_id == trip_count - 1);
2244   } else {
2245     if (__kmp_static == kmp_sch_static_balanced) {
2246       UT chunk = trip_count / nteams;
2247       UT extras = trip_count % nteams;
2248       *plower +=
2249           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2250       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2251       if (plastiter != NULL)
2252         *plastiter = (team_id == nteams - 1);
2253     } else {
2254       T chunk_inc_count =
2255           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2256       T upper = *pupper;
2257       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2258       // Unknown static scheduling type.
2259       *plower += team_id * chunk_inc_count;
2260       *pupper = *plower + chunk_inc_count - incr;
2261       // Check/correct bounds if needed
2262       if (incr > 0) {
2263         if (*pupper < *plower)
2264           *pupper = traits_t<T>::max_value;
2265         if (plastiter != NULL)
2266           *plastiter = *plower <= upper && *pupper > upper - incr;
2267         if (*pupper > upper)
2268           *pupper = upper; // tracker C73258
2269       } else {
2270         if (*pupper > *plower)
2271           *pupper = traits_t<T>::min_value;
2272         if (plastiter != NULL)
2273           *plastiter = *plower >= upper && *pupper < upper - incr;
2274         if (*pupper < upper)
2275           *pupper = upper; // tracker C73258
2276       }
2277     }
2278   }
2279 }
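// Illustrative example of the balanced split above: with trip_count == 10,
// nteams == 3, *plower == 0 and incr == 1, chunk == 3 and extras == 1, so
// team 0 receives [0, 3], team 1 [4, 6] and team 2 [7, 9]; the extra
// iterations always go to the lowest-numbered teams.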
2280 
2281 //-----------------------------------------------------------------------------
2282 // Dispatch routines
2283 //    Transfer call to template< type T >
2284 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2285 //                         T lb, T ub, ST st, ST chunk )
2286 extern "C" {
2287 
2288 /*!
2289 @ingroup WORK_SHARING
2290 @{
2291 @param loc Source location
2292 @param gtid Global thread id
2293 @param schedule Schedule type
2294 @param lb  Lower bound
2295 @param ub  Upper bound
2296 @param st  Step (or increment if you prefer)
2297 @param chunk The chunk size to block with
2298 
2299 This function prepares the runtime to start a dynamically scheduled for loop,
2300 saving the loop arguments.
2301 These functions are all identical apart from the types of the arguments.
2302 */
2303 
2304 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2305                             enum sched_type schedule, kmp_int32 lb,
2306                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2307   KMP_DEBUG_ASSERT(__kmp_init_serial);
2308 #if OMPT_SUPPORT && OMPT_OPTIONAL
2309   OMPT_STORE_RETURN_ADDRESS(gtid);
2310 #endif
2311   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2312 }
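// Illustrative sketch (not emitted by this file): a compiler lowering
// "#pragma omp for schedule(dynamic, chunk)" would typically pair this entry
// point with __kmpc_dispatch_next_4 roughly as follows, where loc, gtid, N,
// chunk and body() stand for values the compiler already has in hand:
//
//   kmp_int32 lb = 0, ub = N - 1, st = 1, last = 0;
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st,
//                          chunk);
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i); // user loop body over the returned chunk [lb, ub]
//   }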
2313 /*!
2314 See @ref __kmpc_dispatch_init_4
2315 */
2316 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2317                              enum sched_type schedule, kmp_uint32 lb,
2318                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2319   KMP_DEBUG_ASSERT(__kmp_init_serial);
2320 #if OMPT_SUPPORT && OMPT_OPTIONAL
2321   OMPT_STORE_RETURN_ADDRESS(gtid);
2322 #endif
2323   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2324 }
2325 
2326 /*!
2327 See @ref __kmpc_dispatch_init_4
2328 */
2329 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2330                             enum sched_type schedule, kmp_int64 lb,
2331                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2332   KMP_DEBUG_ASSERT(__kmp_init_serial);
2333 #if OMPT_SUPPORT && OMPT_OPTIONAL
2334   OMPT_STORE_RETURN_ADDRESS(gtid);
2335 #endif
2336   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2337 }
2338 
2339 /*!
2340 See @ref __kmpc_dispatch_init_4
2341 */
2342 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2343                              enum sched_type schedule, kmp_uint64 lb,
2344                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2345   KMP_DEBUG_ASSERT(__kmp_init_serial);
2346 #if OMPT_SUPPORT && OMPT_OPTIONAL
2347   OMPT_STORE_RETURN_ADDRESS(gtid);
2348 #endif
2349   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2350 }
2351 
2352 /*!
2353 See @ref __kmpc_dispatch_init_4
2354 
2355 These functions differ from the __kmpc_dispatch_init set in that they are
2356 called for the composite distribute parallel for construct. Thus, before
2357 dispatching the regular iterations, the per-team iteration space must be computed.
2358 
2359 These functions are all identical apart from the types of the arguments.
2360 */
2361 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2362                                  enum sched_type schedule, kmp_int32 *p_last,
2363                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2364                                  kmp_int32 chunk) {
2365   KMP_DEBUG_ASSERT(__kmp_init_serial);
2366 #if OMPT_SUPPORT && OMPT_OPTIONAL
2367   OMPT_STORE_RETURN_ADDRESS(gtid);
2368 #endif
2369   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2370   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2371 }
2372 
2373 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2374                                   enum sched_type schedule, kmp_int32 *p_last,
2375                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2376                                   kmp_int32 chunk) {
2377   KMP_DEBUG_ASSERT(__kmp_init_serial);
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379   OMPT_STORE_RETURN_ADDRESS(gtid);
2380 #endif
2381   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2382   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2383 }
2384 
2385 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2386                                  enum sched_type schedule, kmp_int32 *p_last,
2387                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2388                                  kmp_int64 chunk) {
2389   KMP_DEBUG_ASSERT(__kmp_init_serial);
2390 #if OMPT_SUPPORT && OMPT_OPTIONAL
2391   OMPT_STORE_RETURN_ADDRESS(gtid);
2392 #endif
2393   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2394   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2395 }
2396 
2397 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2398                                   enum sched_type schedule, kmp_int32 *p_last,
2399                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2400                                   kmp_int64 chunk) {
2401   KMP_DEBUG_ASSERT(__kmp_init_serial);
2402 #if OMPT_SUPPORT && OMPT_OPTIONAL
2403   OMPT_STORE_RETURN_ADDRESS(gtid);
2404 #endif
2405   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2406   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2407 }
2408 
2409 /*!
2410 @param loc Source code location
2411 @param gtid Global thread id
2412 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2413 otherwise
2414 @param p_lb   Pointer to the lower bound for the next chunk of work
2415 @param p_ub   Pointer to the upper bound for the next chunk of work
2416 @param p_st   Pointer to the stride for the next chunk of work
2417 @return one if there is work to be done, zero otherwise
2418 
2419 Get the next dynamically allocated chunk of work for this thread.
2420 If there is no more work, then lb, ub and stride need not be modified.
2421 */
2422 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2423                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2424 #if OMPT_SUPPORT && OMPT_OPTIONAL
2425   OMPT_STORE_RETURN_ADDRESS(gtid);
2426 #endif
2427   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2428 #if OMPT_SUPPORT && OMPT_OPTIONAL
2429                                         ,
2430                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2431 #endif
2432                                             );
2433 }
2434 
2435 /*!
2436 See @ref __kmpc_dispatch_next_4
2437 */
2438 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2439                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2440                             kmp_int32 *p_st) {
2441 #if OMPT_SUPPORT && OMPT_OPTIONAL
2442   OMPT_STORE_RETURN_ADDRESS(gtid);
2443 #endif
2444   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2445 #if OMPT_SUPPORT && OMPT_OPTIONAL
2446                                          ,
2447                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2448 #endif
2449                                              );
2450 }
2451 
2452 /*!
2453 See @ref __kmpc_dispatch_next_4
2454 */
2455 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2456                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2457 #if OMPT_SUPPORT && OMPT_OPTIONAL
2458   OMPT_STORE_RETURN_ADDRESS(gtid);
2459 #endif
2460   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2461 #if OMPT_SUPPORT && OMPT_OPTIONAL
2462                                         ,
2463                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2464 #endif
2465                                             );
2466 }
2467 
2468 /*!
2469 See @ref __kmpc_dispatch_next_4
2470 */
2471 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2472                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2473                             kmp_int64 *p_st) {
2474 #if OMPT_SUPPORT && OMPT_OPTIONAL
2475   OMPT_STORE_RETURN_ADDRESS(gtid);
2476 #endif
2477   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2478 #if OMPT_SUPPORT && OMPT_OPTIONAL
2479                                          ,
2480                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2481 #endif
2482                                              );
2483 }
2484 
2485 /*!
2486 @param loc Source code location
2487 @param gtid Global thread id
2488 
2489 Mark the end of a dynamic loop.
2490 */
2491 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2492   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2493 }
2494 
2495 /*!
2496 See @ref __kmpc_dispatch_fini_4
2497 */
2498 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2499   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2500 }
2501 
2502 /*!
2503 See @ref __kmpc_dispatch_fini_4
2504 */
2505 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2506   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2507 }
2508 
2509 /*!
2510 See @ref __kmpc_dispatch_fini_4
2511 */
2512 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2513   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2514 }
2515 /*! @} */
2516 
2517 //-----------------------------------------------------------------------------
2518 // Non-template routines from kmp_dispatch.cpp used in other sources
2519 
2520 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2521   return value == checker;
2522 }
2523 
2524 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2525   return value != checker;
2526 }
2527 
2528 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2529   return value < checker;
2530 }
2531 
2532 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2533   return value >= checker;
2534 }
2535 
2536 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2537   return value <= checker;
2538 }
2539 
2540 kmp_uint32
2541 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2542              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2543              void *obj // Higher-level synchronization object, or NULL.
2544              ) {
2545   // note: we may not belong to a team at this point
2546   volatile kmp_uint32 *spin = spinner;
2547   kmp_uint32 check = checker;
2548   kmp_uint32 spins;
2549   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2550   kmp_uint32 r;
2551 
2552   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2553   KMP_INIT_YIELD(spins);
2554   // main wait spin loop
2555   while (!f(r = TCR_4(*spin), check)) {
2556     KMP_FSYNC_SPIN_PREPARE(obj);
2557     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2558        split. It causes problems with infinite recursion because of exit lock */
2559     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2560         __kmp_abort_thread(); */
2561     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2562   }
2563   KMP_FSYNC_SPIN_ACQUIRED(obj);
2564   return r;
2565 }
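// Illustrative use (hypothetical flag name): spin until a shared 32-bit value
// reaches a target using one of the predicates defined above, e.g.
//   kmp_uint32 seen = __kmp_wait_4(&some_flag, 1, __kmp_eq_4, NULL);
// which returns the observed value once pred(value, checker) holds.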
2566 
2567 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2568                       kmp_uint32 (*pred)(void *, kmp_uint32),
2569                       void *obj // Higher-level synchronization object, or NULL.
2570                       ) {
2571   // note: we may not belong to a team at this point
2572   void *spin = spinner;
2573   kmp_uint32 check = checker;
2574   kmp_uint32 spins;
2575   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2576 
2577   KMP_FSYNC_SPIN_INIT(obj, spin);
2578   KMP_INIT_YIELD(spins);
2579   // main wait spin loop
2580   while (!f(spin, check)) {
2581     KMP_FSYNC_SPIN_PREPARE(obj);
2582     /* if we have waited a bit, or are oversubscribed, yield */
2583     /* pause is in the following code */
2584     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2585   }
2586   KMP_FSYNC_SPIN_ACQUIRED(obj);
2587 }
2588 
2589 } // extern "C"
2590 
2591 #ifdef KMP_GOMP_COMPAT
2592 
2593 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2594                                enum sched_type schedule, kmp_int32 lb,
2595                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2596                                int push_ws) {
2597   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2598                                  push_ws);
2599 }
2600 
2601 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2602                                 enum sched_type schedule, kmp_uint32 lb,
2603                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2604                                 int push_ws) {
2605   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2606                                   push_ws);
2607 }
2608 
2609 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2610                                enum sched_type schedule, kmp_int64 lb,
2611                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2612                                int push_ws) {
2613   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2614                                  push_ws);
2615 }
2616 
2617 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2618                                 enum sched_type schedule, kmp_uint64 lb,
2619                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2620                                 int push_ws) {
2621   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2622                                   push_ws);
2623 }
2624 
2625 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2626   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2627 }
2628 
2629 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2630   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2631 }
2632 
2633 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2634   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2635 }
2636 
2637 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2638   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2639 }
2640 
2641 #endif /* KMP_GOMP_COMPAT */
2642 
2643 /* ------------------------------------------------------------------------ */
2644