1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
16  *       change value between parallel regions.  __kmp_max_nth is the
17  *       largest value __kmp_nth may take, and 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   int monotonicity;
76   // default to monotonic
77   monotonicity = SCHEDULE_MONOTONIC;
78   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79     monotonicity = SCHEDULE_NONMONOTONIC;
80   else if (SCHEDULE_HAS_MONOTONIC(schedule))
81     monotonicity = SCHEDULE_MONOTONIC;
82   return monotonicity;
83 }
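// Illustrative sketch (comment only, not compiled): how the modifier bits are
// expected to resolve, assuming the kmp_sch_modifier_* constants from kmp.h.
//
//   enum sched_type s = (enum sched_type)(kmp_sch_dynamic_chunked |
//                                         kmp_sch_modifier_nonmonotonic);
//   int m = __kmp_get_monotonicity(s);  // SCHEDULE_NONMONOTONIC
//   s = SCHEDULE_WITHOUT_MODIFIERS(s);  // plain kmp_sch_dynamic_chunked
//
// With neither modifier bit set, the function defaults to SCHEDULE_MONOTONIC.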
84 
85 // Initialize a dispatch_private_info_template<T> buffer for a particular
86 // type of schedule and chunk.  The loop description is found in lb (lower bound),
87 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
88 // to the scheduling (often the number of threads in a team, but not always if
89 // hierarchical scheduling is used).  tid is the id of the thread calling
90 // the function within the group of nproc threads.  It will have a value
91 // between 0 and nproc - 1.  This is often just the thread id within a team, but
92 // is not necessarily the case when using hierarchical scheduling.
93 // loc is the source file location of the corresponding loop
94 // gtid is the global thread id
95 template <typename T>
96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97                                    dispatch_private_info_template<T> *pr,
98                                    enum sched_type schedule, T lb, T ub,
99                                    typename traits_t<T>::signed_t st,
100 #if USE_ITT_BUILD
101                                    kmp_uint64 *cur_chunk,
102 #endif
103                                    typename traits_t<T>::signed_t chunk,
104                                    T nproc, T tid) {
105   typedef typename traits_t<T>::unsigned_t UT;
106   typedef typename traits_t<T>::floating_t DBL;
107 
108   int active;
109   T tc;
110   kmp_info_t *th;
111   kmp_team_t *team;
112   int monotonicity;
113   bool use_hier;
114 
115 #ifdef KMP_DEBUG
116   typedef typename traits_t<T>::signed_t ST;
117   {
118     char *buff;
119     // create format specifiers before the debug output
120     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123                             traits_t<T>::spec, traits_t<T>::spec,
124                             traits_t<ST>::spec, traits_t<ST>::spec,
125                             traits_t<T>::spec, traits_t<T>::spec);
126     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127     __kmp_str_free(&buff);
128   }
129 #endif
130   /* setup data */
131   th = __kmp_threads[gtid];
132   team = th->th.th_team;
133   active = !team->t.t_serialized;
134 
135 #if USE_ITT_BUILD
136   int itt_need_metadata_reporting =
137       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139       team->t.t_active_level == 1;
140 #endif
141 
142 #if KMP_USE_HIER_SCHED
143   use_hier = pr->flags.use_hier;
144 #else
145   use_hier = false;
146 #endif
147 
148   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149   monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
151 
152   /* Pick up the nomerge/ordered bits from the scheduling type */
153   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154     pr->flags.nomerge = TRUE;
155     schedule =
156         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
157   } else {
158     pr->flags.nomerge = FALSE;
159   }
160   pr->type_size = traits_t<T>::type_size; // remember the size of variables
161   if (kmp_ord_lower & schedule) {
162     pr->flags.ordered = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
165   } else {
166     pr->flags.ordered = FALSE;
167   }
168   // Ordered overrides nonmonotonic
169   if (pr->flags.ordered) {
170     monotonicity = SCHEDULE_MONOTONIC;
171   }
172 
173   if (schedule == kmp_sch_static) {
174     schedule = __kmp_static;
175   } else {
176     if (schedule == kmp_sch_runtime) {
177       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
178       // not specified)
179       schedule = team->t.t_sched.r_sched_type;
180       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
182       // Detail the schedule if needed (global controls are differentiated
183       // appropriately)
184       if (schedule == kmp_sch_guided_chunked) {
185         schedule = __kmp_guided;
186       } else if (schedule == kmp_sch_static) {
187         schedule = __kmp_static;
188       }
189       // Use the chunk size specified by OMP_SCHEDULE (or default if not
190       // specified)
191       chunk = team->t.t_sched.chunk;
192 #if USE_ITT_BUILD
193       if (cur_chunk)
194         *cur_chunk = chunk;
195 #endif
196 #ifdef KMP_DEBUG
197       {
198         char *buff;
199         // create format specifiers before the debug output
200         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201                                 "schedule:%%d chunk:%%%s\n",
202                                 traits_t<ST>::spec);
203         KD_TRACE(10, (buff, gtid, schedule, chunk));
204         __kmp_str_free(&buff);
205       }
206 #endif
207     } else {
208       if (schedule == kmp_sch_guided_chunked) {
209         schedule = __kmp_guided;
210       }
211       if (chunk <= 0) {
212         chunk = KMP_DEFAULT_CHUNK;
213       }
214     }
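    // For example (illustrative, not part of the code): with
    // OMP_SCHEDULE="guided,7" the runtime branch above picks up
    // kmp_sch_guided_chunked with chunk 7 from team->t.t_sched, whereas
    // schedule(runtime) without OMP_SCHEDULE set falls back to the library
    // default schedule.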
215 
216     if (schedule == kmp_sch_auto) {
217       // __kmp_auto is chosen (mapped and differentiated) in __kmp_do_serial_initialize()
218       schedule = __kmp_auto;
219 #ifdef KMP_DEBUG
220       {
221         char *buff;
222         // create format specifiers before the debug output
223         buff = __kmp_str_format(
224             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225             "schedule:%%d chunk:%%%s\n",
226             traits_t<ST>::spec);
227         KD_TRACE(10, (buff, gtid, schedule, chunk));
228         __kmp_str_free(&buff);
229       }
230 #endif
231     }
232 #if KMP_STATIC_STEAL_ENABLED
233     // map nonmonotonic:dynamic to static steal
234     if (schedule == kmp_sch_dynamic_chunked) {
235       if (monotonicity == SCHEDULE_NONMONOTONIC)
236         schedule = kmp_sch_static_steal;
237     }
238 #endif
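    // For example (illustrative): a loop compiled from
    //   #pragma omp for schedule(nonmonotonic : dynamic, 4)
    // arrives here as kmp_sch_dynamic_chunked with the nonmonotonic modifier
    // and, when KMP_STATIC_STEAL_ENABLED is set, is remapped above to
    // kmp_sch_static_steal (the exact sched_type encoding comes from the
    // compiler).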
239     /* guided analytical is not safe for more than 2^20 threads */
240     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241       schedule = kmp_sch_guided_iterative_chunked;
242       KMP_WARNING(DispatchManyThreads);
243     }
244     if (schedule == kmp_sch_runtime_simd) {
245       // compiler provides simd_width in the chunk parameter
246       schedule = team->t.t_sched.r_sched_type;
247       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
249       // Detail the schedule if needed (global controls are differentiated
250       // appropriately)
251       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252           schedule == __kmp_static) {
253         schedule = kmp_sch_static_balanced_chunked;
254       } else {
255         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256           schedule = kmp_sch_guided_simd;
257         }
258         chunk = team->t.t_sched.chunk * chunk;
259       }
260 #if USE_ITT_BUILD
261       if (cur_chunk)
262         *cur_chunk = chunk;
263 #endif
264 #ifdef KMP_DEBUG
265       {
266         char *buff;
267         // create format specifiers before the debug output
268         buff = __kmp_str_format(
269             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
270             " chunk:%%%s\n",
271             traits_t<ST>::spec);
272         KD_TRACE(10, (buff, gtid, schedule, chunk));
273         __kmp_str_free(&buff);
274       }
275 #endif
276     }
277     pr->u.p.parm1 = chunk;
278   }
279   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280               "unknown scheduling type");
281 
282   pr->u.p.count = 0;
283 
284   if (__kmp_env_consistency_check) {
285     if (st == 0) {
286       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
288     }
289   }
290   // compute trip count
291   if (st == 1) { // most common case
292     if (ub >= lb) {
293       tc = ub - lb + 1;
294     } else { // ub < lb
295       tc = 0; // zero-trip
296     }
297   } else if (st < 0) {
298     if (lb >= ub) {
299       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300       // where the division needs to be unsigned regardless of the result type
301       tc = (UT)(lb - ub) / (-st) + 1;
302     } else { // lb < ub
303       tc = 0; // zero-trip
304     }
305   } else { // st > 0
306     if (ub >= lb) {
307       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(ub - lb) / st + 1;
310     } else { // ub < lb
311       tc = 0; // zero-trip
312     }
313   }
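  // Worked example of the trip-count formulas above (illustration only): for
  // lb=5, ub=17, st=4 the loop visits i = 5, 9, 13, 17 and tc = (17-5)/4 + 1
  // = 4; for lb=10, ub=1, st=-1 it gives tc = (10-1)/1 + 1 = 10.  The unsigned
  // cast only matters when (ub - lb) would overflow the signed type, as the
  // comments above note.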
314 
315 #if KMP_STATS_ENABLED
316   if (KMP_MASTER_GTID(gtid)) {
317     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
318   }
319 #endif
320 
321   pr->u.p.lb = lb;
322   pr->u.p.ub = ub;
323   pr->u.p.st = st;
324   pr->u.p.tc = tc;
325 
326 #if KMP_OS_WINDOWS
327   pr->u.p.last_upper = ub + st;
328 #endif /* KMP_OS_WINDOWS */
329 
330   /* NOTE: only active parallel regions have active ordered sections */
331 
332   if (active) {
333     if (pr->flags.ordered) {
334       pr->ordered_bumped = 0;
335       pr->u.p.ordered_lower = 1;
336       pr->u.p.ordered_upper = 0;
337     }
338   }
339 
340   switch (schedule) {
341 #if (KMP_STATIC_STEAL_ENABLED)
342   case kmp_sch_static_steal: {
343     T ntc, init;
344 
345     KD_TRACE(100,
346              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
347               gtid));
348 
349     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350     if (nproc > 1 && ntc >= nproc) {
351       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
352       T id = tid;
353       T small_chunk, extras;
354 
355       small_chunk = ntc / nproc;
356       extras = ntc % nproc;
357 
358       init = id * small_chunk + (id < extras ? id : extras);
359       pr->u.p.count = init;
360       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
361 
362       pr->u.p.parm2 = lb;
363       // parm3 is the number of times to attempt stealing, which is
364       // proportional to the number of chunks per thread, capped at
365       // nproc.
366       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
367       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
368       pr->u.p.st = st;
369       if (traits_t<T>::type_size > 4) {
370         // AC: TODO: check if 16-byte CAS available and use it to
371         // improve performance (probably wait for explicit request
372         // before spending time on this).
373         // For now use dynamically allocated per-thread lock,
374         // free memory in __kmp_dispatch_next when status==0.
375         KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
376         pr->u.p.th_steal_lock =
377             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378         __kmp_init_lock(pr->u.p.th_steal_lock);
379       }
380       break;
381     } else {
382       /* too few chunks: switching to kmp_sch_dynamic_chunked */
383       schedule = kmp_sch_dynamic_chunked;
384       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
385                      "kmp_sch_dynamic_chunked\n",
386                       gtid));
387       if (pr->u.p.parm1 <= 0)
388         pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
389       break;
390     } // if
391   } // case
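  // Illustration of the static_steal setup above (hypothetical numbers, not
  // executed): with tc = 100, chunk = 10 and nproc = 4, there are ntc = 10
  // chunks, small_chunk = 2 and extras = 2, so thread ids 0..3 start at chunk
  // indexes init = 0, 3, 6, 8 with initial ub = 3, 6, 8, 10, i.e. threads 0
  // and 1 own 3 chunks each and threads 2 and 3 own 2 chunks each.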
392 #endif
393   case kmp_sch_static_balanced: {
394     T init, limit;
395 
396     KD_TRACE(
397         100,
398         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
399          gtid));
400 
401     if (nproc > 1) {
402       T id = tid;
403 
404       if (tc < nproc) {
405         if (id < tc) {
406           init = id;
407           limit = id;
408           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
409         } else {
410           pr->u.p.count = 1; /* means no more chunks to execute */
411           pr->u.p.parm1 = FALSE;
412           break;
413         }
414       } else {
415         T small_chunk = tc / nproc;
416         T extras = tc % nproc;
417         init = id * small_chunk + (id < extras ? id : extras);
418         limit = init + small_chunk - (id < extras ? 0 : 1);
419         pr->u.p.parm1 = (id == nproc - 1);
420       }
421     } else {
422       if (tc > 0) {
423         init = 0;
424         limit = tc - 1;
425         pr->u.p.parm1 = TRUE;
426       } else {
427         // zero trip count
428         pr->u.p.count = 1; /* means no more chunks to execute */
429         pr->u.p.parm1 = FALSE;
430         break;
431       }
432     }
433 #if USE_ITT_BUILD
434     // Calculate chunk for metadata report
435     if (itt_need_metadata_reporting)
436       if (cur_chunk)
437         *cur_chunk = limit - init + 1;
438 #endif
439     if (st == 1) {
440       pr->u.p.lb = lb + init;
441       pr->u.p.ub = lb + limit;
442     } else {
443       // ub_tmp is the calculated upper bound; "ub" is the user-defined bound
444       T ub_tmp = lb + limit * st;
445       pr->u.p.lb = lb + init * st;
446       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
447       // it exactly
448       if (st > 0) {
449         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
450       } else {
451         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
452       }
453     }
454     if (pr->flags.ordered) {
455       pr->u.p.ordered_lower = init;
456       pr->u.p.ordered_upper = limit;
457     }
458     break;
459   } // case
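  // Illustration of the balanced split above (hypothetical numbers): with
  // tc = 10 and nproc = 4, small_chunk = 2 and extras = 2, so the per-thread
  // [init, limit] ranges are [0,2], [3,5], [6,7], [8,9]; only id == nproc-1
  // sets parm1, the lastprivate flag.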
460   case kmp_sch_static_balanced_chunked: {
461     // similar to balanced, but chunk adjusted to multiple of simd width
462     T nth = nproc;
463     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
464                    " -> falling-through to static_greedy\n",
465                    gtid));
466     schedule = kmp_sch_static_greedy;
467     if (nth > 1)
468       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
469     else
470       pr->u.p.parm1 = tc;
471     break;
472   } // case
473   case kmp_sch_guided_simd:
474   case kmp_sch_guided_iterative_chunked: {
475     KD_TRACE(
476         100,
477         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
478          " case\n",
479          gtid));
480 
481     if (nproc > 1) {
482       if ((2L * chunk + 1) * nproc >= tc) {
483         /* chunk size too large, switch to dynamic */
484         schedule = kmp_sch_dynamic_chunked;
485       } else {
486         // when remaining iters become less than parm2 - switch to dynamic
487         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
488         *(double *)&pr->u.p.parm3 =
489             guided_flt_param / nproc; // may occupy parm3 and parm4
490       }
491     } else {
492       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
493                      "kmp_sch_static_greedy\n",
494                      gtid));
495       schedule = kmp_sch_static_greedy;
496       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
497       KD_TRACE(
498           100,
499           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
500            gtid));
501       pr->u.p.parm1 = tc;
502     } // if
503   } // case
504   break;
505   case kmp_sch_guided_analytical_chunked: {
506     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
507                    "kmp_sch_guided_analytical_chunked case\n",
508                    gtid));
509 
510     if (nproc > 1) {
511       if ((2L * chunk + 1) * nproc >= tc) {
512         /* chunk size too large, switch to dynamic */
513         schedule = kmp_sch_dynamic_chunked;
514       } else {
515         /* commonly used term: (2 nproc - 1)/(2 nproc) */
516         DBL x;
517 
518 #if KMP_USE_X87CONTROL
519         /* Linux* OS already has 64-bit computation by default for long double,
520            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
521            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
522            instead of the default 53-bit. Even though long double doesn't work
523            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
524            expected to impact the correctness of the algorithm, but this has not
525            been mathematically proven. */
526         // save original FPCW and set precision to 64-bit, as
527         // Windows* OS on IA-32 architecture defaults to 53-bit
528         unsigned int oldFpcw = _control87(0, 0);
529         _control87(_PC_64, _MCW_PC); // 0,0x30000
530 #endif
531         /* value used for comparison in solver for cross-over point */
532         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
533 
534         /* crossover point--chunk indexes equal to or greater than
535            this point switch to dynamic-style scheduling */
536         UT cross;
537 
538         /* commonly used term: (2 nproc - 1)/(2 nproc) */
539         x = (long double)1.0 - (long double)0.5 / nproc;
540 
541 #ifdef KMP_DEBUG
542         { // test natural alignment
543           struct _test_a {
544             char a;
545             union {
546               char b;
547               DBL d;
548             };
549           } t;
550           ptrdiff_t natural_alignment =
551               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
552           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
553           // long)natural_alignment );
554           KMP_DEBUG_ASSERT(
555               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
556         }
557 #endif // KMP_DEBUG
558 
559         /* save the term in thread private dispatch structure */
560         *(DBL *)&pr->u.p.parm3 = x;
561 
562         /* solve for the crossover point to the nearest integer i for which C_i
563            <= chunk */
564         {
565           UT left, right, mid;
566           long double p;
567 
568           /* estimate initial upper and lower bound */
569 
570           /* doesn't matter what value right is as long as it is positive, but
571              it affects performance of the solver */
572           right = 229;
573           p = __kmp_pow<UT>(x, right);
574           if (p > target) {
575             do {
576               p *= p;
577               right <<= 1;
578             } while (p > target && right < (1 << 27));
579             /* lower bound is previous (failed) estimate of upper bound */
580             left = right >> 1;
581           } else {
582             left = 0;
583           }
584 
585           /* bisection root-finding method */
586           while (left + 1 < right) {
587             mid = (left + right) / 2;
588             if (__kmp_pow<UT>(x, mid) > target) {
589               left = mid;
590             } else {
591               right = mid;
592             }
593           } // while
594           cross = right;
595         }
596         /* assert sanity of computed crossover point */
597         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
598                    __kmp_pow<UT>(x, cross) <= target);
599 
600         /* save the crossover point in thread private dispatch structure */
601         pr->u.p.parm2 = cross;
602 
603 // C75803
604 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
605 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
606 #else
607 #define GUIDED_ANALYTICAL_WORKAROUND (x)
608 #endif
609         /* dynamic-style scheduling offset */
610         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
611                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
612                         cross * chunk;
613 #if KMP_USE_X87CONTROL
614         // restore FPCW
615         _control87(oldFpcw, _MCW_PC);
616 #endif
617       } // if
618     } else {
619       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
620                      "kmp_sch_static_greedy\n",
621                      gtid));
622       schedule = kmp_sch_static_greedy;
623       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
624       pr->u.p.parm1 = tc;
625     } // if
626   } // case
627   break;
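  // Rough sketch of the analytical crossover above (illustration only): each
  // guided step hands out roughly 1/(2*nproc) of the remaining iterations, so
  // after i chunks about tc * x^i iterations remain, with x = 1 - 1/(2*nproc).
  // The bisection finds the smallest chunk index ("cross") at which the guided
  // chunk would shrink below the requested chunk size; chunk indexes >= cross
  // are then served with fixed-size chunks, dynamic style.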
628   case kmp_sch_static_greedy:
629     KD_TRACE(
630         100,
631         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
632          gtid));
633     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
634     break;
635   case kmp_sch_static_chunked:
636   case kmp_sch_dynamic_chunked:
637     if (pr->u.p.parm1 <= 0) {
638       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
639     }
640     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
641                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
642                    gtid));
643     break;
644   case kmp_sch_trapezoidal: {
645     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
646 
647     T parm1, parm2, parm3, parm4;
648     KD_TRACE(100,
649              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
650               gtid));
651 
652     parm1 = chunk;
653 
654     /* F : size of the first cycle */
655     parm2 = (tc / (2 * nproc));
656 
657     if (parm2 < 1) {
658       parm2 = 1;
659     }
660 
661     /* L : size of the last cycle.  Make sure the last cycle is not larger
662        than the first cycle. */
663     if (parm1 < 1) {
664       parm1 = 1;
665     } else if (parm1 > parm2) {
666       parm1 = parm2;
667     }
668 
669     /* N : number of cycles */
670     parm3 = (parm2 + parm1);
671     parm3 = (2 * tc + parm3 - 1) / parm3;
672 
673     if (parm3 < 2) {
674       parm3 = 2;
675     }
676 
677     /* sigma : decreasing incr of the trapezoid */
678     parm4 = (parm3 - 1);
679     parm4 = (parm2 - parm1) / parm4;
680 
681     // pointless check, because parm4 >= 0 always
682     // if ( parm4 < 0 ) {
683     //    parm4 = 0;
684     //}
685 
686     pr->u.p.parm1 = parm1;
687     pr->u.p.parm2 = parm2;
688     pr->u.p.parm3 = parm3;
689     pr->u.p.parm4 = parm4;
690   } // case
691   break;
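  // Worked example of the TSS parameters above (hypothetical numbers): with
  // tc = 1000, nproc = 4 and chunk = 10, the first chunk is parm2 = 1000/8 =
  // 125, the last chunk is parm1 = 10, the number of chunks is parm3 =
  // ceil(2000/135) = 15, and the per-chunk decrement is parm4 = (125-10)/14 =
  // 8, so chunk sizes run 125, 117, 109, ... down toward parm1.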
692 
693   default: {
694     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
695                 KMP_HNT(GetNewerLibrary), // Hint
696                 __kmp_msg_null // Variadic argument list terminator
697                 );
698   } break;
699   } // switch
700   pr->schedule = schedule;
701 }
702 
703 #if KMP_USE_HIER_SCHED
704 template <typename T>
705 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
706                                              typename traits_t<T>::signed_t st);
707 template <>
708 inline void
709 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
710                                             kmp_int32 ub, kmp_int32 st) {
711   __kmp_dispatch_init_hierarchy<kmp_int32>(
712       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
713       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
714 }
715 template <>
716 inline void
717 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
718                                              kmp_uint32 ub, kmp_int32 st) {
719   __kmp_dispatch_init_hierarchy<kmp_uint32>(
720       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722 }
723 template <>
724 inline void
725 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
726                                             kmp_int64 ub, kmp_int64 st) {
727   __kmp_dispatch_init_hierarchy<kmp_int64>(
728       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
730 }
731 template <>
732 inline void
733 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
734                                              kmp_uint64 ub, kmp_int64 st) {
735   __kmp_dispatch_init_hierarchy<kmp_uint64>(
736       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738 }
739 
740 // free all the hierarchy scheduling memory associated with the team
741 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
742   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
743   for (int i = 0; i < num_disp_buff; ++i) {
744     // type does not matter here so use kmp_int32
745     auto sh =
746         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
747             &team->t.t_disp_buffer[i]);
748     if (sh->hier) {
749       sh->hier->deallocate();
750       __kmp_free(sh->hier);
751     }
752   }
753 }
754 #endif
755 
756 // UT - unsigned flavor of T, ST - signed flavor of T,
757 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
758 template <typename T>
759 static void
760 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
761                     T ub, typename traits_t<T>::signed_t st,
762                     typename traits_t<T>::signed_t chunk, int push_ws) {
763   typedef typename traits_t<T>::unsigned_t UT;
764 
765   int active;
766   kmp_info_t *th;
767   kmp_team_t *team;
768   kmp_uint32 my_buffer_index;
769   dispatch_private_info_template<T> *pr;
770   dispatch_shared_info_template<T> volatile *sh;
771 
772   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
773                    sizeof(dispatch_private_info));
774   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
775                    sizeof(dispatch_shared_info));
776 
777   if (!TCR_4(__kmp_init_parallel))
778     __kmp_parallel_initialize();
779 
780   __kmp_resume_if_soft_paused();
781 
782 #if INCLUDE_SSC_MARKS
783   SSC_MARK_DISPATCH_INIT();
784 #endif
785 #ifdef KMP_DEBUG
786   typedef typename traits_t<T>::signed_t ST;
787   {
788     char *buff;
789     // create format specifiers before the debug output
790     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
791                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
792                             traits_t<ST>::spec, traits_t<T>::spec,
793                             traits_t<T>::spec, traits_t<ST>::spec);
794     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
795     __kmp_str_free(&buff);
796   }
797 #endif
798   /* setup data */
799   th = __kmp_threads[gtid];
800   team = th->th.th_team;
801   active = !team->t.t_serialized;
802   th->th.th_ident = loc;
803 
804   // Any half-decent optimizer will remove this test when the blocks are
805   // empty, since the macros expand to nothing when statistics are
806   // disabled.
807   if (schedule == __kmp_static) {
808     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
809   } else {
810     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
811   }
812 
813 #if KMP_USE_HIER_SCHED
814   // Initialize the scheduling hierarchy if requested via the OMP_SCHEDULE
815   // environment variable.  Hierarchical scheduling does not work with ordered
816   // loops, so if ordered is detected, revert to standard per-thread scheduling.
817   bool ordered;
818   enum sched_type my_sched = schedule;
819   my_buffer_index = th->th.th_dispatch->th_disp_index;
820   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
821       &th->th.th_dispatch
822            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
823   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
824   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
825     my_sched =
826         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
827   ordered = (kmp_ord_lower & my_sched);
828   if (pr->flags.use_hier) {
829     if (ordered) {
830       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
831                      "Disabling hierarchical scheduling.\n",
832                      gtid));
833       pr->flags.use_hier = FALSE;
834     }
835   }
836   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
837     // Don't use hierarchical for ordered parallel loops and don't
838     // use the runtime hierarchy if one was specified in the program
839     if (!ordered && !pr->flags.use_hier)
840       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
841   }
842 #endif // KMP_USE_HIER_SCHED
843 
844 #if USE_ITT_BUILD
845   kmp_uint64 cur_chunk = chunk;
846   int itt_need_metadata_reporting =
847       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
848       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
849       team->t.t_active_level == 1;
850 #endif
851   if (!active) {
852     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
853         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
854   } else {
855     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
856                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
857 
858     my_buffer_index = th->th.th_dispatch->th_disp_index++;
859 
860     /* What happens when number of threads changes, need to resize buffer? */
861     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862         &th->th.th_dispatch
863              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
864     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
865         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
866     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
867                   my_buffer_index));
868   }
869 
870   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
871 #if USE_ITT_BUILD
872                                 &cur_chunk,
873 #endif
874                                 chunk, (T)th->th.th_team_nproc,
875                                 (T)th->th.th_info.ds.ds_tid);
876   if (active) {
877     if (pr->flags.ordered == 0) {
878       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
879       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
880     } else {
881       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
882       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
883     }
884   }
885 
886   if (active) {
887     /* Wait until the shared buffer_index reaches my_buffer_index, i.e. until
888      * this dispatch buffer is free for this thread to use. */
889 
890     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
891                    "sh->buffer_index:%d\n",
892                    gtid, my_buffer_index, sh->buffer_index));
893     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
894                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
895     // Note: KMP_WAIT() cannot be used here: buffer_index and
896     // my_buffer_index are *always* 32-bit integers.
897     KMP_MB(); /* is this necessary? */
898     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
899                    "sh->buffer_index:%d\n",
900                    gtid, my_buffer_index, sh->buffer_index));
901 
902     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
903     th->th.th_dispatch->th_dispatch_sh_current =
904         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
905 #if USE_ITT_BUILD
906     if (pr->flags.ordered) {
907       __kmp_itt_ordered_init(gtid);
908     }
909     // Report loop metadata
910     if (itt_need_metadata_reporting) {
911       // Only report metadata by master of active team at level 1
912       kmp_uint64 schedtype = 0;
913       switch (schedule) {
914       case kmp_sch_static_chunked:
915       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
916         break;
917       case kmp_sch_static_greedy:
918         cur_chunk = pr->u.p.parm1;
919         break;
920       case kmp_sch_dynamic_chunked:
921         schedtype = 1;
922         break;
923       case kmp_sch_guided_iterative_chunked:
924       case kmp_sch_guided_analytical_chunked:
925       case kmp_sch_guided_simd:
926         schedtype = 2;
927         break;
928       default:
929         // Should we put this case under "static"?
930         // case kmp_sch_static_steal:
931         schedtype = 3;
932         break;
933       }
934       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
935     }
936 #if KMP_USE_HIER_SCHED
937     if (pr->flags.use_hier) {
938       pr->u.p.count = 0;
939       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
940     }
941 #endif // KMP_USE_HIER_SCHED
942 #endif /* USE_ITT_BUILD */
943   }
944 
945 #ifdef KMP_DEBUG
946   {
947     char *buff;
948     // create format specifiers before the debug output
949     buff = __kmp_str_format(
950         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
951         "lb:%%%s ub:%%%s"
952         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
953         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
954         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
955         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
956         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
957         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
958     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
959                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
960                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
961                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
962     __kmp_str_free(&buff);
963   }
964 #endif
965 #if (KMP_STATIC_STEAL_ENABLED)
966   // It cannot be guaranteed that after execution of a loop with some other
967   // schedule kind all the parm3 variables will contain the same value. Even if
968   // they did, there would still be a bad case, such as reusing 0 and 1 instead
969   // of a program-lifetime increment, so a dedicated variable is required.
970   // The 'static_steal_counter' is used for this purpose.
971   if (pr->schedule == kmp_sch_static_steal) {
972     // Other threads will inspect this variable when searching for a victim.
973     // It serves as a flag showing that other threads may steal from this
974     // thread from now on.
975     volatile T *p = &pr->u.p.static_steal_counter;
976     *p = *p + 1;
977   }
978 #endif // ( KMP_STATIC_STEAL_ENABLED )
979 
980 #if OMPT_SUPPORT && OMPT_OPTIONAL
981   if (ompt_enabled.ompt_callback_work) {
982     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
983     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
984     ompt_callbacks.ompt_callback(ompt_callback_work)(
985         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
986         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
987   }
988 #endif
989   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
990 }
991 
992 /* For ordered loops, either __kmp_dispatch_finish() should be called after
993  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
994  * every chunk of iterations.  If the ordered section(s) were not executed
995  * for this iteration (or every iteration in this chunk), we need to set the
996  * ordered iteration counters so that the next thread can proceed. */
997 template <typename UT>
998 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
999   typedef typename traits_t<UT>::signed_t ST;
1000   kmp_info_t *th = __kmp_threads[gtid];
1001 
1002   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1003   if (!th->th.th_team->t.t_serialized) {
1004 
1005     dispatch_private_info_template<UT> *pr =
1006         reinterpret_cast<dispatch_private_info_template<UT> *>(
1007             th->th.th_dispatch->th_dispatch_pr_current);
1008     dispatch_shared_info_template<UT> volatile *sh =
1009         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1010             th->th.th_dispatch->th_dispatch_sh_current);
1011     KMP_DEBUG_ASSERT(pr);
1012     KMP_DEBUG_ASSERT(sh);
1013     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1014                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1015 
1016     if (pr->ordered_bumped) {
1017       KD_TRACE(
1018           1000,
1019           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1020            gtid));
1021       pr->ordered_bumped = 0;
1022     } else {
1023       UT lower = pr->u.p.ordered_lower;
1024 
1025 #ifdef KMP_DEBUG
1026       {
1027         char *buff;
1028         // create format specifiers before the debug output
1029         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1030                                 "ordered_iteration:%%%s lower:%%%s\n",
1031                                 traits_t<UT>::spec, traits_t<UT>::spec);
1032         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1033         __kmp_str_free(&buff);
1034       }
1035 #endif
1036 
1037       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1038                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1039       KMP_MB(); /* is this necessary? */
1040 #ifdef KMP_DEBUG
1041       {
1042         char *buff;
1043         // create format specifiers before the debug output
1044         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1045                                 "ordered_iteration:%%%s lower:%%%s\n",
1046                                 traits_t<UT>::spec, traits_t<UT>::spec);
1047         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1048         __kmp_str_free(&buff);
1049       }
1050 #endif
1051 
1052       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1053     } // if
1054   } // if
1055   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1056 }
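// Illustration of the protocol above (comment only): suppose a thread owns
// ordered iteration 3 (pr->u.p.ordered_lower == 3) but never entered its
// ordered section, so ordered_bumped is still 0.  The wait above blocks until
// sh->u.s.ordered_iteration reaches 3, i.e. until iterations 0..2 have bumped
// the shared counter, and the final test_then_inc advances it so the owner of
// iteration 4 can proceed.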
1057 
1058 #ifdef KMP_GOMP_COMPAT
1059 
1060 template <typename UT>
1061 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1062   typedef typename traits_t<UT>::signed_t ST;
1063   kmp_info_t *th = __kmp_threads[gtid];
1064 
1065   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1066   if (!th->th.th_team->t.t_serialized) {
1067     //        int cid;
1068     dispatch_private_info_template<UT> *pr =
1069         reinterpret_cast<dispatch_private_info_template<UT> *>(
1070             th->th.th_dispatch->th_dispatch_pr_current);
1071     dispatch_shared_info_template<UT> volatile *sh =
1072         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1073             th->th.th_dispatch->th_dispatch_sh_current);
1074     KMP_DEBUG_ASSERT(pr);
1075     KMP_DEBUG_ASSERT(sh);
1076     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1077                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1078 
1079     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1080     UT lower = pr->u.p.ordered_lower;
1081     UT upper = pr->u.p.ordered_upper;
1082     UT inc = upper - lower + 1;
1083 
1084     if (pr->ordered_bumped == inc) {
1085       KD_TRACE(
1086           1000,
1087           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1088            gtid));
1089       pr->ordered_bumped = 0;
1090     } else {
1091       inc -= pr->ordered_bumped;
1092 
1093 #ifdef KMP_DEBUG
1094       {
1095         char *buff;
1096         // create format specifiers before the debug output
1097         buff = __kmp_str_format(
1098             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1099             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1100             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1101         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1102         __kmp_str_free(&buff);
1103       }
1104 #endif
1105 
1106       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1107                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1108 
1109       KMP_MB(); /* is this necessary? */
1110       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1111                       "ordered_bumped to zero\n",
1112                       gtid));
1113       pr->ordered_bumped = 0;
1114 // TODO: check whether inc should be unsigned or signed
1115 #ifdef KMP_DEBUG
1116       {
1117         char *buff;
1118         // create format specifiers before the debug output
1119         buff = __kmp_str_format(
1120             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1121             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1122             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1123             traits_t<UT>::spec);
1124         KD_TRACE(1000,
1125                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1126         __kmp_str_free(&buff);
1127       }
1128 #endif
1129 
1130       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1131     }
1132     //        }
1133   }
1134   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1135 }
1136 
1137 #endif /* KMP_GOMP_COMPAT */
1138 
1139 template <typename T>
1140 int __kmp_dispatch_next_algorithm(int gtid,
1141                                   dispatch_private_info_template<T> *pr,
1142                                   dispatch_shared_info_template<T> volatile *sh,
1143                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1144                                   typename traits_t<T>::signed_t *p_st, T nproc,
1145                                   T tid) {
1146   typedef typename traits_t<T>::unsigned_t UT;
1147   typedef typename traits_t<T>::signed_t ST;
1148   typedef typename traits_t<T>::floating_t DBL;
1149   int status = 0;
1150   kmp_int32 last = 0;
1151   T start;
1152   ST incr;
1153   UT limit, trip, init;
1154   kmp_info_t *th = __kmp_threads[gtid];
1155   kmp_team_t *team = th->th.th_team;
1156 
1157   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1158                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1159   KMP_DEBUG_ASSERT(pr);
1160   KMP_DEBUG_ASSERT(sh);
1161   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1162 #ifdef KMP_DEBUG
1163   {
1164     char *buff;
1165     // create format specifiers before the debug output
1166     buff =
1167         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1168                          "sh:%%p nproc:%%%s tid:%%%s\n",
1169                          traits_t<T>::spec, traits_t<T>::spec);
1170     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1171     __kmp_str_free(&buff);
1172   }
1173 #endif
1174 
1175   // zero trip count
1176   if (pr->u.p.tc == 0) {
1177     KD_TRACE(10,
1178              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1179               "zero status:%d\n",
1180               gtid, status));
1181     return 0;
1182   }
1183 
1184   switch (pr->schedule) {
1185 #if (KMP_STATIC_STEAL_ENABLED)
1186   case kmp_sch_static_steal: {
1187     T chunk = pr->u.p.parm1;
1188 
1189     KD_TRACE(100,
1190              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1191               gtid));
1192 
1193     trip = pr->u.p.tc - 1;
1194 
1195     if (traits_t<T>::type_size > 4) {
1196       // use lock for 8-byte and CAS for 4-byte induction
1197       // variable. TODO (optional): check and use 16-byte CAS
1198       kmp_lock_t *lck = pr->u.p.th_steal_lock;
1199       KMP_DEBUG_ASSERT(lck != NULL);
1200       if (pr->u.p.count < (UT)pr->u.p.ub) {
1201         __kmp_acquire_lock(lck, gtid);
1202         // try to get own chunk of iterations
1203         init = (pr->u.p.count)++;
1204         status = (init < (UT)pr->u.p.ub);
1205         __kmp_release_lock(lck, gtid);
1206       } else {
1207         status = 0; // no own chunks
1208       }
1209       if (!status) { // try to steal
1210         kmp_info_t **other_threads = team->t.t_threads;
1211         int while_limit = pr->u.p.parm3;
1212         int while_index = 0;
1213         T id = pr->u.p.static_steal_counter; // loop id
1214         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1215                   __kmp_dispatch_num_buffers; // current loop index
1216         // note: victim thread can potentially execute another loop
1217         // TODO: algorithm of searching for a victim
1218         // should be cleaned up and measured
1219         while ((!status) && (while_limit != ++while_index)) {
1220           dispatch_private_info_template<T> *victim;
1221           T remaining;
1222           T victimIdx = pr->u.p.parm4;
1223           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1224           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1225               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1226           KMP_DEBUG_ASSERT(victim);
1227           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1228                  oldVictimIdx != victimIdx) {
1229             victimIdx = (victimIdx + 1) % nproc;
1230             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1231                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1232             KMP_DEBUG_ASSERT(victim);
1233           }
1234           if (victim == pr || id != victim->u.p.static_steal_counter) {
1235             continue; // try once more (nproc attempts in total)
1236             // no victim is ready yet to participate in stealing
1237             // because no victim passed kmp_init_dispatch yet
1238           }
1239           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1240             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1241             continue; // not enough chunks to steal, goto next victim
1242           }
1243 
1244           lck = victim->u.p.th_steal_lock;
1245           KMP_ASSERT(lck != NULL);
1246           __kmp_acquire_lock(lck, gtid);
1247           limit = victim->u.p.ub; // keep initial ub
1248           if (victim->u.p.count >= limit ||
1249               (remaining = limit - victim->u.p.count) < 2) {
1250             __kmp_release_lock(lck, gtid);
1251             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1252             continue; // not enough chunks to steal
1253           }
1254           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1255           // by 1
1256           if (remaining > 3) {
1257             // steal 1/4 of remaining
1258             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1259             init = (victim->u.p.ub -= (remaining >> 2));
1260           } else {
1261             // steal 1 chunk of 2 or 3 remaining
1262             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1263             init = (victim->u.p.ub -= 1);
1264           }
1265           __kmp_release_lock(lck, gtid);
1266 
1267           KMP_DEBUG_ASSERT(init + 1 <= limit);
1268           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1269           status = 1;
1270           while_index = 0;
1271           // now update own count and ub with the stolen range, minus the init chunk
1272           __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1273           pr->u.p.count = init + 1;
1274           pr->u.p.ub = limit;
1275           __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1276         } // while (search for victim)
1277       } // if (try to find victim and steal)
1278     } else {
1279       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1280       typedef union {
1281         struct {
1282           UT count;
1283           T ub;
1284         } p;
1285         kmp_int64 b;
1286       } union_i4;
1287       // All updates to 'count' or 'ub' must modify the packed (count, ub)
1288       // pair atomically, as a single 64-bit operation.
1289       {
1290         union_i4 vold, vnew;
1291         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1292         vnew = vold;
1293         vnew.p.count++;
1294         while (!KMP_COMPARE_AND_STORE_ACQ64(
1295             (volatile kmp_int64 *)&pr->u.p.count,
1296             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1297             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1298           KMP_CPU_PAUSE();
1299           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1300           vnew = vold;
1301           vnew.p.count++;
1302         }
1303         vnew = vold;
1304         init = vnew.p.count;
1305         status = (init < (UT)vnew.p.ub);
1306       }
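      // The union above packs the 32-bit (count, ub) pair into a single 64-bit
      // word, so claiming a chunk (count++) here and a thief shrinking the
      // range (ub -= n) below are both single 64-bit CAS updates and can never
      // expose a half-updated pair.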
1307 
1308       if (!status) {
1309         kmp_info_t **other_threads = team->t.t_threads;
1310         int while_limit = pr->u.p.parm3;
1311         int while_index = 0;
1312         T id = pr->u.p.static_steal_counter; // loop id
1313         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1314                   __kmp_dispatch_num_buffers; // current loop index
1315         // note: victim thread can potentially execute another loop
1316         // TODO: algorithm of searching for a victim
1317         // should be cleaned up and measured
1318         while ((!status) && (while_limit != ++while_index)) {
1319           dispatch_private_info_template<T> *victim;
1320           union_i4 vold, vnew;
1321           kmp_int32 remaining;
1322           T victimIdx = pr->u.p.parm4;
1323           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1324           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1325               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1326           KMP_DEBUG_ASSERT(victim);
1327           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1328                  oldVictimIdx != victimIdx) {
1329             victimIdx = (victimIdx + 1) % nproc;
1330             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1331                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1332             KMP_DEBUG_ASSERT(victim);
1333           }
1334           if (victim == pr || id != victim->u.p.static_steal_counter) {
1335             continue; // try once more (nproc attempts in total)
1336             // no victim is ready yet to participate in stealing
1337             // because no victim passed kmp_init_dispatch yet
1338           }
1339           pr->u.p.parm4 = victimIdx; // new victim found
1340           while (1) { // CAS loop if victim has enough chunks to steal
1341             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1342             vnew = vold;
1343 
1344             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1345             if (vnew.p.count >= (UT)vnew.p.ub ||
1346                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1347               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1348               break; // not enough chunks to steal, goto next victim
1349             }
1350             if (remaining > 3) {
1351               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1352             } else {
1353               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1354             }
1355             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1356             // TODO: Should this be acquire or release?
1357             if (KMP_COMPARE_AND_STORE_ACQ64(
1358                     (volatile kmp_int64 *)&victim->u.p.count,
1359                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1360                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1361               // stealing succeeded
1362               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1363                                         vold.p.ub - vnew.p.ub);
1364               status = 1;
1365               while_index = 0;
1366               // now update own count and ub
1367               init = vnew.p.ub;
1368               vold.p.count = init + 1;
1369 #if KMP_ARCH_X86
1370               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1371 #else
1372               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1373 #endif
1374               break;
1375             } // if (check CAS result)
1376             KMP_CPU_PAUSE(); // CAS failed, retry on this victim
1377           } // while (try to steal from particular victim)
1378         } // while (search for victim)
1379       } // if (try to find victim and steal)
1380     } // if (4-byte induction variable)
1381     if (!status) {
1382       *p_lb = 0;
1383       *p_ub = 0;
1384       if (p_st != NULL)
1385         *p_st = 0;
1386     } else {
1387       start = pr->u.p.parm2;
1388       init *= chunk;
1389       limit = chunk + init - 1;
1390       incr = pr->u.p.st;
1391       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1392 
1393       KMP_DEBUG_ASSERT(init <= trip);
1394       if ((last = (limit >= trip)) != 0)
1395         limit = trip;
1396       if (p_st != NULL)
1397         *p_st = incr;
1398 
1399       if (incr == 1) {
1400         *p_lb = start + init;
1401         *p_ub = start + limit;
1402       } else {
1403         *p_lb = start + init * incr;
1404         *p_ub = start + limit * incr;
1405       }
1406 
1407       if (pr->flags.ordered) {
1408         pr->u.p.ordered_lower = init;
1409         pr->u.p.ordered_upper = limit;
1410       } // if
1411     } // if
1412     break;
1413   } // case
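  // Illustration of the stealing step above (hypothetical numbers): if a
  // victim still has remaining = 8 unclaimed chunks, the thief lowers the
  // victim's ub by remaining >> 2 = 2, claims the first stolen chunk (init)
  // for the current call, and installs count = init + 1, ub = limit locally
  // so later calls hand out the rest of the stolen range without stealing
  // again.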
1414 #endif // ( KMP_STATIC_STEAL_ENABLED )
1415   case kmp_sch_static_balanced: {
1416     KD_TRACE(
1417         10,
1418         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1419          gtid));
1420     /* check if thread has any iteration to do */
1421     if ((status = !pr->u.p.count) != 0) {
1422       pr->u.p.count = 1;
1423       *p_lb = pr->u.p.lb;
1424       *p_ub = pr->u.p.ub;
1425       last = pr->u.p.parm1;
1426       if (p_st != NULL)
1427         *p_st = pr->u.p.st;
1428     } else { /* no iterations to do */
1429       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1430     }
1431   } // case
1432   break;
1433   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1434                                  merged here */
1435   case kmp_sch_static_chunked: {
1436     T parm1;
1437 
1438     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1439                    "kmp_sch_static_[greedy|chunked] case\n",
1440                    gtid));
1441     parm1 = pr->u.p.parm1;
1442 
1443     trip = pr->u.p.tc - 1;
1444     init = parm1 * (pr->u.p.count + tid);
1445 
1446     if ((status = (init <= trip)) != 0) {
1447       start = pr->u.p.lb;
1448       incr = pr->u.p.st;
1449       limit = parm1 + init - 1;
1450 
1451       if ((last = (limit >= trip)) != 0)
1452         limit = trip;
1453 
1454       if (p_st != NULL)
1455         *p_st = incr;
1456 
1457       pr->u.p.count += nproc;
1458 
1459       if (incr == 1) {
1460         *p_lb = start + init;
1461         *p_ub = start + limit;
1462       } else {
1463         *p_lb = start + init * incr;
1464         *p_ub = start + limit * incr;
1465       }
1466 
1467       if (pr->flags.ordered) {
1468         pr->u.p.ordered_lower = init;
1469         pr->u.p.ordered_upper = limit;
1470       } // if
1471     } // if
1472   } // case
1473   break;
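  /* Editor's note (worked example): with chunk size parm1 = 10, nproc = 4 and
     tid = 2, the first pass through this case gives init = 10 * (0 + 2) = 20
     and limit = 29, i.e. iterations 20..29; after count += nproc, the next
     call gives init = 10 * (4 + 2) = 60 and limit = 69.  Chunks are therefore
     handed out in a fixed round-robin pattern, which is what makes this
     schedule static. */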
1474 
1475   case kmp_sch_dynamic_chunked: {
1476     T chunk = pr->u.p.parm1;
1477 
1478     KD_TRACE(
1479         100,
1480         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1481          gtid));
1482 
1483     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1484     trip = pr->u.p.tc - 1;
1485 
1486     if ((status = (init <= trip)) == 0) {
1487       *p_lb = 0;
1488       *p_ub = 0;
1489       if (p_st != NULL)
1490         *p_st = 0;
1491     } else {
1492       start = pr->u.p.lb;
1493       limit = chunk + init - 1;
1494       incr = pr->u.p.st;
1495 
1496       if ((last = (limit >= trip)) != 0)
1497         limit = trip;
1498 
1499       if (p_st != NULL)
1500         *p_st = incr;
1501 
1502       if (incr == 1) {
1503         *p_lb = start + init;
1504         *p_ub = start + limit;
1505       } else {
1506         *p_lb = start + init * incr;
1507         *p_ub = start + limit * incr;
1508       }
1509 
1510       if (pr->flags.ordered) {
1511         pr->u.p.ordered_lower = init;
1512         pr->u.p.ordered_upper = limit;
1513       } // if
1514     } // if
1515   } // case
1516   break;
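  /* Editor's note (worked example): each call atomically claims the next
     chunk ticket from sh->u.s.iteration.  With chunk = 8, a thread that
     claims ticket 3 gets init = 24 and limit = 31, i.e. iterations 24..31
     (clipped to trip for the final chunk).  Thread identity plays no role,
     so chunks go to whichever thread asks first. */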
1517 
1518   case kmp_sch_guided_iterative_chunked: {
1519     T chunkspec = pr->u.p.parm1;
1520     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1521                    "iterative case\n",
1522                    gtid));
1523     trip = pr->u.p.tc;
1524     // Start atomic part of calculations
1525     while (1) {
1526       ST remaining; // signed, because can be < 0
1527       init = sh->u.s.iteration; // shared value
1528       remaining = trip - init;
1529       if (remaining <= 0) { // AC: need to compare with 0 first
1530         // nothing to do, don't try atomic op
1531         status = 0;
1532         break;
1533       }
1534       if ((T)remaining <
1535           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1536         // use dynamic-style schedule
1537         // atomically increment iterations, get old value
1538         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1539                                  (ST)chunkspec);
1540         remaining = trip - init;
1541         if (remaining <= 0) {
1542           status = 0; // all iterations got by other threads
1543         } else {
1544           // got some iterations to work on
1545           status = 1;
1546           if ((T)remaining > chunkspec) {
1547             limit = init + chunkspec - 1;
1548           } else {
1549             last = 1; // the last chunk
1550             limit = init + remaining - 1;
1551           } // if
1552         } // if
1553         break;
1554       } // if
1555       limit = init +
1556               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1557       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1558                                (ST)init, (ST)limit)) {
1559         // CAS was successful, chunk obtained
1560         status = 1;
1561         --limit;
1562         break;
1563       } // if
1564     } // while
1565     if (status != 0) {
1566       start = pr->u.p.lb;
1567       incr = pr->u.p.st;
1568       if (p_st != NULL)
1569         *p_st = incr;
1570       *p_lb = start + init * incr;
1571       *p_ub = start + limit * incr;
1572       if (pr->flags.ordered) {
1573         pr->u.p.ordered_lower = init;
1574         pr->u.p.ordered_upper = limit;
1575       } // if
1576     } else {
1577       *p_lb = 0;
1578       *p_ub = 0;
1579       if (p_st != NULL)
1580         *p_st = 0;
1581     } // if
1582   } // case
1583   break;
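  /* Editor's note (worked example, assuming the default K = 2 noted in the
     comments above, so that parm3 holds 1/(K*nproc)): each successful CAS
     claims roughly remaining/(K*nproc) iterations.  With trip = 1000 and
     nproc = 4, the first claim covers iterations 0..124 (1000/8 = 125), the
     next 125..233 (875/8 rounds down to 109), and so on; once fewer than
     parm2 = K*nproc*(chunkspec+1) iterations remain, the code above falls
     back to dynamic dispatch of chunkspec-sized chunks. */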
1584 
1585   case kmp_sch_guided_simd: {
1586     // same as the iterative case, but the current chunk is adjusted to be
1587     // a multiple of the given chunk
1588     T chunk = pr->u.p.parm1;
1589     KD_TRACE(100,
1590              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1591               gtid));
1592     trip = pr->u.p.tc;
1593     // Start atomic part of calculations
1594     while (1) {
1595       ST remaining; // signed, because can be < 0
1596       init = sh->u.s.iteration; // shared value
1597       remaining = trip - init;
1598       if (remaining <= 0) { // AC: need to compare with 0 first
1599         status = 0; // nothing to do, don't try atomic op
1600         break;
1601       }
1602       KMP_DEBUG_ASSERT(init % chunk == 0);
1603       // compare with K*nproc*(chunk+1), K=2 by default
1604       if ((T)remaining < pr->u.p.parm2) {
1605         // use dynamic-style schedule
1606         // atomically increment iterations, get old value
1607         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1608                                  (ST)chunk);
1609         remaining = trip - init;
1610         if (remaining <= 0) {
1611           status = 0; // all iterations got by other threads
1612         } else {
1613           // got some iterations to work on
1614           status = 1;
1615           if ((T)remaining > chunk) {
1616             limit = init + chunk - 1;
1617           } else {
1618             last = 1; // the last chunk
1619             limit = init + remaining - 1;
1620           } // if
1621         } // if
1622         break;
1623       } // if
1624       // divide by K*nproc
1625       UT span = remaining * (*(double *)&pr->u.p.parm3);
1626       UT rem = span % chunk;
1627       if (rem) // adjust so that span%chunk == 0
1628         span += chunk - rem;
1629       limit = init + span;
1630       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1631                                (ST)init, (ST)limit)) {
1632         // CAS was successful, chunk obtained
1633         status = 1;
1634         --limit;
1635         break;
1636       } // if
1637     } // while
1638     if (status != 0) {
1639       start = pr->u.p.lb;
1640       incr = pr->u.p.st;
1641       if (p_st != NULL)
1642         *p_st = incr;
1643       *p_lb = start + init * incr;
1644       *p_ub = start + limit * incr;
1645       if (pr->flags.ordered) {
1646         pr->u.p.ordered_lower = init;
1647         pr->u.p.ordered_upper = limit;
1648       } // if
1649     } else {
1650       *p_lb = 0;
1651       *p_ub = 0;
1652       if (p_st != NULL)
1653         *p_st = 0;
1654     } // if
1655   } // case
1656   break;
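  /* Editor's note (worked example): the only difference from the iterative
     case above is that the claimed span is rounded up to a multiple of chunk
     so that every dispatched range stays a whole number of SIMD chunks.  With
     remaining = 1000, nproc = 4, K = 2 and chunk = 16, the raw span is 125;
     125 % 16 = 13, so the span becomes 128 and the claim covers iterations
     init..init+127. */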
1657 
1658   case kmp_sch_guided_analytical_chunked: {
1659     T chunkspec = pr->u.p.parm1;
1660     UT chunkIdx;
1661 #if KMP_USE_X87CONTROL
1662     /* for storing original FPCW value for Windows* OS on
1663        IA-32 architecture 8-byte version */
1664     unsigned int oldFpcw;
1665     unsigned int fpcwSet = 0;
1666 #endif
1667     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1668                    "kmp_sch_guided_analytical_chunked case\n",
1669                    gtid));
1670 
1671     trip = pr->u.p.tc;
1672 
1673     KMP_DEBUG_ASSERT(nproc > 1);
1674     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1675 
1676     while (1) { /* this while loop is a safeguard against unexpected zero
1677                    chunk sizes */
1678       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1679       if (chunkIdx >= (UT)pr->u.p.parm2) {
1680         --trip;
1681         /* use dynamic-style scheduling */
1682         init = chunkIdx * chunkspec + pr->u.p.count;
1683         /* need to verify init > 0 in case of overflow in the above
1684          * calculation */
1685         if ((status = (init > 0 && init <= trip)) != 0) {
1686           limit = init + chunkspec - 1;
1687 
1688           if ((last = (limit >= trip)) != 0)
1689             limit = trip;
1690         }
1691         break;
1692       } else {
1693 /* use exponential-style scheduling */
1694 /* The following check works around the lack of long double precision on
1695    Windows* OS, which could otherwise have the effect that init != 0 for
1696    chunkIdx == 0.
1697  */
1698 #if KMP_USE_X87CONTROL
1699         /* If we haven't already done so, save original
1700            FPCW and set precision to 64-bit, as Windows* OS
1701            on IA-32 architecture defaults to 53-bit */
1702         if (!fpcwSet) {
1703           oldFpcw = _control87(0, 0);
1704           _control87(_PC_64, _MCW_PC);
1705           fpcwSet = 0x30000;
1706         }
1707 #endif
1708         if (chunkIdx) {
1709           init = __kmp_dispatch_guided_remaining<T>(
1710               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1711           KMP_DEBUG_ASSERT(init);
1712           init = trip - init;
1713         } else
1714           init = 0;
1715         limit = trip - __kmp_dispatch_guided_remaining<T>(
1716                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1717         KMP_ASSERT(init <= limit);
1718         if (init < limit) {
1719           KMP_DEBUG_ASSERT(limit <= trip);
1720           --limit;
1721           status = 1;
1722           break;
1723         } // if
1724       } // if
1725     } // while (1)
1726 #if KMP_USE_X87CONTROL
1727     /* restore FPCW if necessary
1728        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1729     */
1730     if (fpcwSet && (oldFpcw & fpcwSet))
1731       _control87(oldFpcw, _MCW_PC);
1732 #endif
1733     if (status != 0) {
1734       start = pr->u.p.lb;
1735       incr = pr->u.p.st;
1736       if (p_st != NULL)
1737         *p_st = incr;
1738       *p_lb = start + init * incr;
1739       *p_ub = start + limit * incr;
1740       if (pr->flags.ordered) {
1741         pr->u.p.ordered_lower = init;
1742         pr->u.p.ordered_upper = limit;
1743       }
1744     } else {
1745       *p_lb = 0;
1746       *p_ub = 0;
1747       if (p_st != NULL)
1748         *p_st = 0;
1749     }
1750   } // case
1751   break;
1752 
1753   case kmp_sch_trapezoidal: {
1754     UT index;
1755     T parm2 = pr->u.p.parm2;
1756     T parm3 = pr->u.p.parm3;
1757     T parm4 = pr->u.p.parm4;
1758     KD_TRACE(100,
1759              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1760               gtid));
1761 
1762     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1763 
1764     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1765     trip = pr->u.p.tc - 1;
1766 
1767     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1768       *p_lb = 0;
1769       *p_ub = 0;
1770       if (p_st != NULL)
1771         *p_st = 0;
1772     } else {
1773       start = pr->u.p.lb;
1774       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1775       incr = pr->u.p.st;
1776 
1777       if ((last = (limit >= trip)) != 0)
1778         limit = trip;
1779 
1780       if (p_st != NULL)
1781         *p_st = incr;
1782 
1783       if (incr == 1) {
1784         *p_lb = start + init;
1785         *p_ub = start + limit;
1786       } else {
1787         *p_lb = start + init * incr;
1788         *p_ub = start + limit * incr;
1789       }
1790 
1791       if (pr->flags.ordered) {
1792         pr->u.p.ordered_lower = init;
1793         pr->u.p.ordered_upper = limit;
1794       } // if
1795     } // if
1796   } // case
1797   break;
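  /* Editor's note (worked example): judging from the formulas above, parm2 is
     the size of the first chunk and parm4 the amount by which each successive
     chunk shrinks, so chunk i has parm2 - i*parm4 iterations and init is the
     arithmetic-series sum of the chunks before it.  With parm2 = 10 and
     parm4 = 2 the chunk sizes are 10, 8, 6, ...; index = 2 gives
     init = (2 * (20 - 2)) / 2 = 18 and limit = (3 * (20 - 4)) / 2 - 1 = 23,
     i.e. iterations 18..23 (a chunk of 6). */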
1798   default: {
1799     status = 0; // avoid warnings about use of an uninitialized variable
1800     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1801                 KMP_HNT(GetNewerLibrary), // Hint
1802                 __kmp_msg_null // Variadic argument list terminator
1803                 );
1804   } break;
1805   } // switch
1806   if (p_last)
1807     *p_last = last;
1808 #ifdef KMP_DEBUG
1809   if (pr->flags.ordered) {
1810     char *buff;
1811     // create format specifiers before the debug output
1812     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1813                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1814                             traits_t<UT>::spec, traits_t<UT>::spec);
1815     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1816     __kmp_str_free(&buff);
1817   }
1818   {
1819     char *buff;
1820     // create format specifiers before the debug output
1821     buff = __kmp_str_format(
1822         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1823         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1824         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1825     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1826     __kmp_str_free(&buff);
1827   }
1828 #endif
1829   return status;
1830 }
1831 
1832 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1833    work), then tell OMPT the loop is over. In some cases __kmpc_dispatch_fini()
1834    is not called. */
1835 #if OMPT_SUPPORT && OMPT_OPTIONAL
1836 #define OMPT_LOOP_END                                                          \
1837   if (status == 0) {                                                           \
1838     if (ompt_enabled.ompt_callback_work) {                                     \
1839       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1840       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1841       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1842           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1843           &(task_info->task_data), 0, codeptr);                                \
1844     }                                                                          \
1845   }
1846 // TODO: implement count
1847 #else
1848 #define OMPT_LOOP_END // no-op
1849 #endif
1850 
1851 #if KMP_STATS_ENABLED
1852 #define KMP_STATS_LOOP_END                                                     \
1853   {                                                                            \
1854     kmp_int64 u, l, t, i;                                                      \
1855     l = (kmp_int64)(*p_lb);                                                    \
1856     u = (kmp_int64)(*p_ub);                                                    \
1857     i = (kmp_int64)(pr->u.p.st);                                               \
1858     if (status == 0) {                                                         \
1859       t = 0;                                                                   \
1860       KMP_POP_PARTITIONED_TIMER();                                             \
1861     } else if (i == 1) {                                                       \
1862       if (u >= l)                                                              \
1863         t = u - l + 1;                                                         \
1864       else                                                                     \
1865         t = 0;                                                                 \
1866     } else if (i < 0) {                                                        \
1867       if (l >= u)                                                              \
1868         t = (l - u) / (-i) + 1;                                                \
1869       else                                                                     \
1870         t = 0;                                                                 \
1871     } else {                                                                   \
1872       if (u >= l)                                                              \
1873         t = (u - l) / i + 1;                                                   \
1874       else                                                                     \
1875         t = 0;                                                                 \
1876     }                                                                          \
1877     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1878   }
1879 #else
1880 #define KMP_STATS_LOOP_END /* Nothing */
1881 #endif
1882 
1883 template <typename T>
1884 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1885                                T *p_lb, T *p_ub,
1886                                typename traits_t<T>::signed_t *p_st
1887 #if OMPT_SUPPORT && OMPT_OPTIONAL
1888                                ,
1889                                void *codeptr
1890 #endif
1891                                ) {
1892 
1893   typedef typename traits_t<T>::unsigned_t UT;
1894   typedef typename traits_t<T>::signed_t ST;
1895   // This is potentially slightly misleading: schedule(runtime) will appear
1896   // here even if the actual runtime schedule is static. (Which points out a
1897   // disadvantage of schedule(runtime): even when static scheduling is used,
1898   // it costs more than a compile-time choice of static scheduling would.)
1899   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1900 
1901   int status;
1902   dispatch_private_info_template<T> *pr;
1903   kmp_info_t *th = __kmp_threads[gtid];
1904   kmp_team_t *team = th->th.th_team;
1905 
1906   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1907   KD_TRACE(
1908       1000,
1909       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1910        gtid, p_lb, p_ub, p_st, p_last));
1911 
1912   if (team->t.t_serialized) {
1913     /* NOTE: serialize this dispatch because we are not at the active level */
1914     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1915         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1916     KMP_DEBUG_ASSERT(pr);
1917 
1918     if ((status = (pr->u.p.tc != 0)) == 0) {
1919       *p_lb = 0;
1920       *p_ub = 0;
1921       //            if ( p_last != NULL )
1922       //                *p_last = 0;
1923       if (p_st != NULL)
1924         *p_st = 0;
1925       if (__kmp_env_consistency_check) {
1926         if (pr->pushed_ws != ct_none) {
1927           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1928         }
1929       }
1930     } else if (pr->flags.nomerge) {
1931       kmp_int32 last;
1932       T start;
1933       UT limit, trip, init;
1934       ST incr;
1935       T chunk = pr->u.p.parm1;
1936 
1937       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1938                      gtid));
1939 
1940       init = chunk * pr->u.p.count++;
1941       trip = pr->u.p.tc - 1;
1942 
1943       if ((status = (init <= trip)) == 0) {
1944         *p_lb = 0;
1945         *p_ub = 0;
1946         //                if ( p_last != NULL )
1947         //                    *p_last = 0;
1948         if (p_st != NULL)
1949           *p_st = 0;
1950         if (__kmp_env_consistency_check) {
1951           if (pr->pushed_ws != ct_none) {
1952             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1953           }
1954         }
1955       } else {
1956         start = pr->u.p.lb;
1957         limit = chunk + init - 1;
1958         incr = pr->u.p.st;
1959 
1960         if ((last = (limit >= trip)) != 0) {
1961           limit = trip;
1962 #if KMP_OS_WINDOWS
1963           pr->u.p.last_upper = pr->u.p.ub;
1964 #endif /* KMP_OS_WINDOWS */
1965         }
1966         if (p_last != NULL)
1967           *p_last = last;
1968         if (p_st != NULL)
1969           *p_st = incr;
1970         if (incr == 1) {
1971           *p_lb = start + init;
1972           *p_ub = start + limit;
1973         } else {
1974           *p_lb = start + init * incr;
1975           *p_ub = start + limit * incr;
1976         }
1977 
1978         if (pr->flags.ordered) {
1979           pr->u.p.ordered_lower = init;
1980           pr->u.p.ordered_upper = limit;
1981 #ifdef KMP_DEBUG
1982           {
1983             char *buff;
1984             // create format specifiers before the debug output
1985             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1986                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1987                                     traits_t<UT>::spec, traits_t<UT>::spec);
1988             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1989                             pr->u.p.ordered_upper));
1990             __kmp_str_free(&buff);
1991           }
1992 #endif
1993         } // if
1994       } // if
1995     } else {
1996       pr->u.p.tc = 0;
1997       *p_lb = pr->u.p.lb;
1998       *p_ub = pr->u.p.ub;
1999 #if KMP_OS_WINDOWS
2000       pr->u.p.last_upper = *p_ub;
2001 #endif /* KMP_OS_WINDOWS */
2002       if (p_last != NULL)
2003         *p_last = TRUE;
2004       if (p_st != NULL)
2005         *p_st = pr->u.p.st;
2006     } // if
2007 #ifdef KMP_DEBUG
2008     {
2009       char *buff;
2010       // create format specifiers before the debug output
2011       buff = __kmp_str_format(
2012           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2013           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2014           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2015       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2016       __kmp_str_free(&buff);
2017     }
2018 #endif
2019 #if INCLUDE_SSC_MARKS
2020     SSC_MARK_DISPATCH_NEXT();
2021 #endif
2022     OMPT_LOOP_END;
2023     KMP_STATS_LOOP_END;
2024     return status;
2025   } else {
2026     kmp_int32 last = 0;
2027     dispatch_shared_info_template<T> volatile *sh;
2028 
2029     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2030                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2031 
2032     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2033         th->th.th_dispatch->th_dispatch_pr_current);
2034     KMP_DEBUG_ASSERT(pr);
2035     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2036         th->th.th_dispatch->th_dispatch_sh_current);
2037     KMP_DEBUG_ASSERT(sh);
2038 
2039 #if KMP_USE_HIER_SCHED
2040     if (pr->flags.use_hier)
2041       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2042     else
2043 #endif // KMP_USE_HIER_SCHED
2044       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2045                                                 p_st, th->th.th_team_nproc,
2046                                                 th->th.th_info.ds.ds_tid);
2047     // status == 0: no more iterations to execute
2048     if (status == 0) {
2049       UT num_done;
2050 
2051       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2052 #ifdef KMP_DEBUG
2053       {
2054         char *buff;
2055         // create format specifiers before the debug output
2056         buff = __kmp_str_format(
2057             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2058             traits_t<UT>::spec);
2059         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2060         __kmp_str_free(&buff);
2061       }
2062 #endif
2063 
2064 #if KMP_USE_HIER_SCHED
2065       pr->flags.use_hier = FALSE;
2066 #endif
2067       if ((ST)num_done == th->th.th_team_nproc - 1) {
2068 #if (KMP_STATIC_STEAL_ENABLED)
2069         if (pr->schedule == kmp_sch_static_steal &&
2070             traits_t<T>::type_size > 4) {
2071           int i;
2072           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2073                     __kmp_dispatch_num_buffers; // current loop index
2074           kmp_info_t **other_threads = team->t.t_threads;
2075           // loop complete, safe to destroy locks used for stealing
2076           for (i = 0; i < th->th.th_team_nproc; ++i) {
2077             dispatch_private_info_template<T> *buf =
2078                 reinterpret_cast<dispatch_private_info_template<T> *>(
2079                     &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2080             kmp_lock_t *lck = buf->u.p.th_steal_lock;
2081             KMP_ASSERT(lck != NULL);
2082             __kmp_destroy_lock(lck);
2083             __kmp_free(lck);
2084             buf->u.p.th_steal_lock = NULL;
2085           }
2086         }
2087 #endif
2088         /* NOTE: release this buffer to be reused */
2089 
2090         KMP_MB(); /* Flush all pending memory write invalidates.  */
2091 
2092         sh->u.s.num_done = 0;
2093         sh->u.s.iteration = 0;
2094 
2095         /* TODO replace with general release procedure? */
2096         if (pr->flags.ordered) {
2097           sh->u.s.ordered_iteration = 0;
2098         }
2099 
2100         KMP_MB(); /* Flush all pending memory write invalidates.  */
2101 
2102         sh->buffer_index += __kmp_dispatch_num_buffers;
2103         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2104                        gtid, sh->buffer_index));
2105 
2106         KMP_MB(); /* Flush all pending memory write invalidates.  */
2107 
2108       } // if
2109       if (__kmp_env_consistency_check) {
2110         if (pr->pushed_ws != ct_none) {
2111           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2112         }
2113       }
2114 
2115       th->th.th_dispatch->th_deo_fcn = NULL;
2116       th->th.th_dispatch->th_dxo_fcn = NULL;
2117       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2118       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2119     } // if (status == 0)
2120 #if KMP_OS_WINDOWS
2121     else if (last) {
2122       pr->u.p.last_upper = pr->u.p.ub;
2123     }
2124 #endif /* KMP_OS_WINDOWS */
2125     if (p_last != NULL && status != 0)
2126       *p_last = last;
2127   } // if
2128 
2129 #ifdef KMP_DEBUG
2130   {
2131     char *buff;
2132     // create format specifiers before the debug output
2133     buff = __kmp_str_format(
2134         "__kmp_dispatch_next: T#%%d normal case: "
2135         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2136         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2137     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2138                   (p_last ? *p_last : 0), status));
2139     __kmp_str_free(&buff);
2140   }
2141 #endif
2142 #if INCLUDE_SSC_MARKS
2143   SSC_MARK_DISPATCH_NEXT();
2144 #endif
2145   OMPT_LOOP_END;
2146   KMP_STATS_LOOP_END;
2147   return status;
2148 }
2149 
2150 template <typename T>
2151 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2152                                   kmp_int32 *plastiter, T *plower, T *pupper,
2153                                   typename traits_t<T>::signed_t incr) {
2154   typedef typename traits_t<T>::unsigned_t UT;
2155   kmp_uint32 team_id;
2156   kmp_uint32 nteams;
2157   UT trip_count;
2158   kmp_team_t *team;
2159   kmp_info_t *th;
2160 
2161   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2162   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2163 #ifdef KMP_DEBUG
2164   typedef typename traits_t<T>::signed_t ST;
2165   {
2166     char *buff;
2167     // create format specifiers before the debug output
2168     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2169                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2170                             traits_t<T>::spec, traits_t<T>::spec,
2171                             traits_t<ST>::spec, traits_t<T>::spec);
2172     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2173     __kmp_str_free(&buff);
2174   }
2175 #endif
2176 
2177   if (__kmp_env_consistency_check) {
2178     if (incr == 0) {
2179       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2180                             loc);
2181     }
2182     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2183       // The loop is illegal.
2184       // Some zero-trip loops are retained by the compiler, e.g.:
2185       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2186       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2187       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2188       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2189       // Compiler does not check the following illegal loops:
2190       //   for(i=0;i<10;i+=incr) // where incr<0
2191       //   for(i=10;i>0;i-=incr) // where incr<0
2192       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2193     }
2194   }
2195   th = __kmp_threads[gtid];
2196   team = th->th.th_team;
2197   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2198   nteams = th->th.th_teams_size.nteams;
2199   team_id = team->t.t_master_tid;
2200   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2201 
2202   // compute global trip count
2203   if (incr == 1) {
2204     trip_count = *pupper - *plower + 1;
2205   } else if (incr == -1) {
2206     trip_count = *plower - *pupper + 1;
2207   } else if (incr > 0) {
2208     // upper-lower can exceed the limit of signed type
2209     trip_count = (UT)(*pupper - *plower) / incr + 1;
2210   } else {
2211     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2212   }
2213 
2214   if (trip_count <= nteams) {
2215     KMP_DEBUG_ASSERT(
2216         __kmp_static == kmp_sch_static_greedy ||
2217         __kmp_static ==
2218             kmp_sch_static_balanced); // Unknown static scheduling type.
2219     // only some teams get a single iteration, the others get nothing
2220     if (team_id < trip_count) {
2221       *pupper = *plower = *plower + team_id * incr;
2222     } else {
2223       *plower = *pupper + incr; // zero-trip loop
2224     }
2225     if (plastiter != NULL)
2226       *plastiter = (team_id == trip_count - 1);
2227   } else {
2228     if (__kmp_static == kmp_sch_static_balanced) {
2229       UT chunk = trip_count / nteams;
2230       UT extras = trip_count % nteams;
2231       *plower +=
2232           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2233       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2234       if (plastiter != NULL)
2235         *plastiter = (team_id == nteams - 1);
2236     } else {
2237       T chunk_inc_count =
2238           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2239       T upper = *pupper;
2240       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2241       // Unknown static scheduling type.
2242       *plower += team_id * chunk_inc_count;
2243       *pupper = *plower + chunk_inc_count - incr;
2244       // Check/correct bounds if needed
2245       if (incr > 0) {
2246         if (*pupper < *plower)
2247           *pupper = traits_t<T>::max_value;
2248         if (plastiter != NULL)
2249           *plastiter = *plower <= upper && *pupper > upper - incr;
2250         if (*pupper > upper)
2251           *pupper = upper; // tracker C73258
2252       } else {
2253         if (*pupper > *plower)
2254           *pupper = traits_t<T>::min_value;
2255         if (plastiter != NULL)
2256           *plastiter = *plower >= upper && *pupper < upper - incr;
2257         if (*pupper < upper)
2258           *pupper = upper; // tracker C73258
2259       }
2260     }
2261   }
2262 }
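/* Editor's note (worked example): for a distribute loop over 0..9 with
   incr = 1, nteams = 4 and __kmp_static == kmp_sch_static_balanced, the code
   above computes chunk = 2 and extras = 2, so team 0 gets iterations 0..2,
   team 1 gets 3..5, team 2 gets 6..7 and team 3 gets 8..9, with only team 3
   setting *plastiter.  If trip_count <= nteams, each of the first trip_count
   teams gets exactly one iteration and the remaining teams get a zero-trip
   range. */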
2263 
2264 //-----------------------------------------------------------------------------
2265 // Dispatch routines
2266 //    Transfer call to template< type T >
2267 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2268 //                         T lb, T ub, ST st, ST chunk )
2269 extern "C" {
2270 
2271 /*!
2272 @ingroup WORK_SHARING
2273 @{
2274 @param loc Source location
2275 @param gtid Global thread id
2276 @param schedule Schedule type
2277 @param lb  Lower bound
2278 @param ub  Upper bound
2279 @param st  Step (or increment if you prefer)
2280 @param chunk The chunk size to block with
2281 
2282 This function prepares the runtime to start a dynamically scheduled for loop,
2283 saving the loop arguments.
2284 These functions are all identical apart from the types of the arguments.
2285 */
2286 
2287 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2288                             enum sched_type schedule, kmp_int32 lb,
2289                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2290   KMP_DEBUG_ASSERT(__kmp_init_serial);
2291 #if OMPT_SUPPORT && OMPT_OPTIONAL
2292   OMPT_STORE_RETURN_ADDRESS(gtid);
2293 #endif
2294   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2295 }
2296 /*!
2297 See @ref __kmpc_dispatch_init_4
2298 */
2299 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2300                              enum sched_type schedule, kmp_uint32 lb,
2301                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2302   KMP_DEBUG_ASSERT(__kmp_init_serial);
2303 #if OMPT_SUPPORT && OMPT_OPTIONAL
2304   OMPT_STORE_RETURN_ADDRESS(gtid);
2305 #endif
2306   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2307 }
2308 
2309 /*!
2310 See @ref __kmpc_dispatch_init_4
2311 */
2312 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2313                             enum sched_type schedule, kmp_int64 lb,
2314                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2315   KMP_DEBUG_ASSERT(__kmp_init_serial);
2316 #if OMPT_SUPPORT && OMPT_OPTIONAL
2317   OMPT_STORE_RETURN_ADDRESS(gtid);
2318 #endif
2319   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2320 }
2321 
2322 /*!
2323 See @ref __kmpc_dispatch_init_4
2324 */
2325 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2326                              enum sched_type schedule, kmp_uint64 lb,
2327                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2328   KMP_DEBUG_ASSERT(__kmp_init_serial);
2329 #if OMPT_SUPPORT && OMPT_OPTIONAL
2330   OMPT_STORE_RETURN_ADDRESS(gtid);
2331 #endif
2332   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2333 }
2334 
2335 /*!
2336 See @ref __kmpc_dispatch_init_4
2337 
2338 These functions differ from the __kmpc_dispatch_init set in that they are
2339 called for the composite 'distribute parallel for' construct, so the per-team
2340 iteration space must be computed before regular iteration dispatching begins.
2341 
2342 These functions are all identical apart from the types of the arguments.
2343 */
2344 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2345                                  enum sched_type schedule, kmp_int32 *p_last,
2346                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2347                                  kmp_int32 chunk) {
2348   KMP_DEBUG_ASSERT(__kmp_init_serial);
2349 #if OMPT_SUPPORT && OMPT_OPTIONAL
2350   OMPT_STORE_RETURN_ADDRESS(gtid);
2351 #endif
2352   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2353   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2354 }
2355 
2356 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2357                                   enum sched_type schedule, kmp_int32 *p_last,
2358                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2359                                   kmp_int32 chunk) {
2360   KMP_DEBUG_ASSERT(__kmp_init_serial);
2361 #if OMPT_SUPPORT && OMPT_OPTIONAL
2362   OMPT_STORE_RETURN_ADDRESS(gtid);
2363 #endif
2364   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2365   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2366 }
2367 
2368 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2369                                  enum sched_type schedule, kmp_int32 *p_last,
2370                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2371                                  kmp_int64 chunk) {
2372   KMP_DEBUG_ASSERT(__kmp_init_serial);
2373 #if OMPT_SUPPORT && OMPT_OPTIONAL
2374   OMPT_STORE_RETURN_ADDRESS(gtid);
2375 #endif
2376   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2377   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2378 }
2379 
2380 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2381                                   enum sched_type schedule, kmp_int32 *p_last,
2382                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2383                                   kmp_int64 chunk) {
2384   KMP_DEBUG_ASSERT(__kmp_init_serial);
2385 #if OMPT_SUPPORT && OMPT_OPTIONAL
2386   OMPT_STORE_RETURN_ADDRESS(gtid);
2387 #endif
2388   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2389   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2390 }
2391 
2392 /*!
2393 @param loc Source code location
2394 @param gtid Global thread id
2395 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2396 otherwise
2397 @param p_lb   Pointer to the lower bound for the next chunk of work
2398 @param p_ub   Pointer to the upper bound for the next chunk of work
2399 @param p_st   Pointer to the stride for the next chunk of work
2400 @return one if there is work to be done, zero otherwise
2401 
2402 Get the next dynamically allocated chunk of work for this thread.
2403 If there is no more work, then lb, ub and stride need not be modified.
2404 */
2405 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2406                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2407 #if OMPT_SUPPORT && OMPT_OPTIONAL
2408   OMPT_STORE_RETURN_ADDRESS(gtid);
2409 #endif
2410   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2411 #if OMPT_SUPPORT && OMPT_OPTIONAL
2412                                         ,
2413                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2414 #endif
2415                                             );
2416 }
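/* Editor's sketch of how a compiler typically drives this interface
   (illustrative only; names such as n and body() are placeholders and the
   exact code generation differs between compilers).  For
   "#pragma omp for schedule(dynamic, 4)" over a signed 32-bit induction
   variable running 0..n-1, the outlined function would contain roughly:

     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }

   Note that a matching __kmpc_dispatch_fini_4() call is not always emitted
   (see the OMPT_LOOP_END comment earlier in this file). */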
2417 
2418 /*!
2419 See @ref __kmpc_dispatch_next_4
2420 */
2421 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2422                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2423                             kmp_int32 *p_st) {
2424 #if OMPT_SUPPORT && OMPT_OPTIONAL
2425   OMPT_STORE_RETURN_ADDRESS(gtid);
2426 #endif
2427   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2428 #if OMPT_SUPPORT && OMPT_OPTIONAL
2429                                          ,
2430                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2431 #endif
2432                                              );
2433 }
2434 
2435 /*!
2436 See @ref __kmpc_dispatch_next_4
2437 */
2438 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2439                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2440 #if OMPT_SUPPORT && OMPT_OPTIONAL
2441   OMPT_STORE_RETURN_ADDRESS(gtid);
2442 #endif
2443   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2444 #if OMPT_SUPPORT && OMPT_OPTIONAL
2445                                         ,
2446                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2447 #endif
2448                                             );
2449 }
2450 
2451 /*!
2452 See @ref __kmpc_dispatch_next_4
2453 */
2454 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2455                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2456                             kmp_int64 *p_st) {
2457 #if OMPT_SUPPORT && OMPT_OPTIONAL
2458   OMPT_STORE_RETURN_ADDRESS(gtid);
2459 #endif
2460   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2461 #if OMPT_SUPPORT && OMPT_OPTIONAL
2462                                          ,
2463                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2464 #endif
2465                                              );
2466 }
2467 
2468 /*!
2469 @param loc Source code location
2470 @param gtid Global thread id
2471 
2472 Mark the end of a dynamic loop.
2473 */
2474 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2475   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2476 }
2477 
2478 /*!
2479 See @ref __kmpc_dispatch_fini_4
2480 */
2481 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2482   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2483 }
2484 
2485 /*!
2486 See @ref __kmpc_dispatch_fini_4
2487 */
2488 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2489   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2490 }
2491 
2492 /*!
2493 See @ref __kmpc_dispatch_fini_4
2494 */
2495 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2496   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2497 }
2498 /*! @} */
2499 
2500 //-----------------------------------------------------------------------------
2501 // Non-template routines from kmp_dispatch.cpp used in other sources
2502 
2503 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2504   return value == checker;
2505 }
2506 
2507 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2508   return value != checker;
2509 }
2510 
2511 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2512   return value < checker;
2513 }
2514 
2515 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2516   return value >= checker;
2517 }
2518 
2519 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2520   return value <= checker;
2521 }
2522 
2523 kmp_uint32
2524 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2525              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2526              void *obj // Higher-level synchronization object, or NULL.
2527              ) {
2528   // note: we may not belong to a team at this point
2529   volatile kmp_uint32 *spin = spinner;
2530   kmp_uint32 check = checker;
2531   kmp_uint32 spins;
2532   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2533   kmp_uint32 r;
2534 
2535   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2536   KMP_INIT_YIELD(spins);
2537   // main wait spin loop
2538   while (!f(r = TCR_4(*spin), check)) {
2539     KMP_FSYNC_SPIN_PREPARE(obj);
2540     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2541        split. It causes problems with infinite recursion because of exit lock */
2542     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2543         __kmp_abort_thread(); */
2544     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2545   }
2546   KMP_FSYNC_SPIN_ACQUIRED(obj);
2547   return r;
2548 }
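/* Editor's sketch (illustrative, not a call site from this file): __kmp_wait_4
   pairs with the small predicate helpers defined above, e.g.

     // spin until the shared counter reaches at least my_ticket
     __kmp_wait_4(&counter, my_ticket, __kmp_ge_4, NULL);

   where counter and my_ticket are placeholders for a shared volatile
   kmp_uint32 and the value being waited for. */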
2549 
2550 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2551                       kmp_uint32 (*pred)(void *, kmp_uint32),
2552                       void *obj // Higher-level synchronization object, or NULL.
2553                       ) {
2554   // note: we may not belong to a team at this point
2555   void *spin = spinner;
2556   kmp_uint32 check = checker;
2557   kmp_uint32 spins;
2558   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2559 
2560   KMP_FSYNC_SPIN_INIT(obj, spin);
2561   KMP_INIT_YIELD(spins);
2562   // main wait spin loop
2563   while (!f(spin, check)) {
2564     KMP_FSYNC_SPIN_PREPARE(obj);
2565     /* if we have waited a bit, or are oversubscribed, yield;
2566        the pause is inside the macro below */
2567     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2568   }
2569   KMP_FSYNC_SPIN_ACQUIRED(obj);
2570 }
2571 
2572 } // extern "C"
2573 
2574 #ifdef KMP_GOMP_COMPAT
2575 
2576 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2577                                enum sched_type schedule, kmp_int32 lb,
2578                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2579                                int push_ws) {
2580   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2581                                  push_ws);
2582 }
2583 
2584 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2585                                 enum sched_type schedule, kmp_uint32 lb,
2586                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2587                                 int push_ws) {
2588   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2589                                   push_ws);
2590 }
2591 
2592 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2593                                enum sched_type schedule, kmp_int64 lb,
2594                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2595                                int push_ws) {
2596   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2597                                  push_ws);
2598 }
2599 
2600 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2601                                 enum sched_type schedule, kmp_uint64 lb,
2602                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2603                                 int push_ws) {
2604   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2605                                   push_ws);
2606 }
2607 
2608 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2609   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2610 }
2611 
2612 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2613   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2614 }
2615 
2616 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2617   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2618 }
2619 
2620 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2621   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2622 }
2623 
2624 #endif /* KMP_GOMP_COMPAT */
2625 
2626 /* ------------------------------------------------------------------------ */
2627