xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.cpp (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  *       it may change values between parallel regions.  __kmp_max_nth
17  *       is the largest value __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // Nonmonotonic as default for dynamic schedule when no modifier is specified
76   int monotonicity = SCHEDULE_NONMONOTONIC;
77 
78   // Let default be monotonic for executables
79   // compiled with OpenMP* 4.5 or less compilers
80   if (loc != NULL && loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier || __kmp_force_monotonic)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
92 
93 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return floating point number rounded to two decimal places.
//
// The value is scaled by 100, biased by half a unit away from zero, truncated
// to int, and scaled back. Biasing away from zero (rather than always adding
// 0.5) makes negative inputs round to the nearest hundredth as well, matching
// the sign handling in __kmp_get_round_val. Results for non-negative inputs
// are unchanged.
//
// NOTE: the scaled value must fit in an int.
static inline float __kmp_round_2decimal_val(float num) {
  return (float)((int)(num < 0 ? num * 100 - 0.5 : num * 100 + 0.5)) / 100;
}
// Round a floating point number to the nearest integer, with halves rounded
// away from zero: the value is biased by 0.5 in the direction of its sign and
// the conversion to int then truncates toward zero.
static inline int __kmp_get_round_val(float num) {
  if (num < 0)
    return (int)(num - 0.5);
  return (int)(num + 0.5);
}
101 #endif
102 
103 template <typename T>
104 inline void
105 __kmp_initialize_self_buffer(kmp_team_t *team, T id,
106                              dispatch_private_info_template<T> *pr,
107                              typename traits_t<T>::unsigned_t nchunks, T nproc,
108                              typename traits_t<T>::unsigned_t &init,
109                              T &small_chunk, T &extras, T &p_extra) {
110 
111 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
112   if (pr->flags.use_hybrid) {
113     kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
114     kmp_hw_core_type_t type =
115         (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
116     T pchunks = pr->u.p.pchunks;
117     T echunks = nchunks - pchunks;
118     T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
119     T num_procs_with_ecore = nproc - num_procs_with_pcore;
120     T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
121     T big_chunk =
122         pchunks / num_procs_with_pcore; // chunks per thread with p-core
123     small_chunk =
124         echunks / num_procs_with_ecore; // chunks per thread with e-core
125 
126     extras =
127         (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
128 
129     p_extra = (big_chunk - small_chunk);
130 
131     if (type == KMP_HW_CORE_TYPE_CORE) {
132       if (id < first_thread_with_ecore) {
133         init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
134       } else {
135         init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
136                (id < extras ? id : extras);
137       }
138     } else {
139       if (id == first_thread_with_ecore) {
140         init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
141       } else {
142         init = id * small_chunk + first_thread_with_ecore * p_extra +
143                (id < extras ? id : extras);
144       }
145     }
146     p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
147     return;
148   }
149 #endif
150 
151   small_chunk = nchunks / nproc; // chunks per thread
152   extras = nchunks % nproc;
153   p_extra = 0;
154   init = id * small_chunk + (id < extras ? id : extras);
155 }
156 
157 #if KMP_STATIC_STEAL_ENABLED
// Values for steal_flag (possible states of private per-loop buffer).
//
// Possible state changes:
//   UNUSED  -> CLAIMED  owner only, sync
//   UNUSED  -> THIEF    thief only, sync
//   CLAIMED -> READY    owner only, async
//   READY   -> THIEF    owner only, async
//   THIEF   -> READY    owner only, async
//   THIEF   -> UNUSED   last thread finishing the loop, async
enum {
  UNUSED = 0,
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // available for stealing
  THIEF = 3 // finished by owner, or claimed by thief
};
171 #endif
172 
173 // Initialize a dispatch_private_info_template<T> buffer for a particular
174 // type of schedule,chunk.  The loop description is found in lb (lower bound),
175 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
176 // to the scheduling (often the number of threads in a team, but not always if
177 // hierarchical scheduling is used).  tid is the id of the thread calling
178 // the function within the group of nproc threads.  It will have a value
179 // between 0 and nproc - 1.  This is often just the thread id within a team, but
180 // is not necessarily the case when using hierarchical scheduling.
181 // loc is the source file location of the corresponding loop
182 // gtid is the global thread id
183 template <typename T>
184 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
185                                    dispatch_private_info_template<T> *pr,
186                                    enum sched_type schedule, T lb, T ub,
187                                    typename traits_t<T>::signed_t st,
188 #if USE_ITT_BUILD
189                                    kmp_uint64 *cur_chunk,
190 #endif
191                                    typename traits_t<T>::signed_t chunk,
192                                    T nproc, T tid) {
193   typedef typename traits_t<T>::unsigned_t UT;
194   typedef typename traits_t<T>::floating_t DBL;
195 
196   int active;
197   T tc;
198   kmp_info_t *th;
199   kmp_team_t *team;
200   int monotonicity;
201   bool use_hier;
202 
203 #ifdef KMP_DEBUG
204   typedef typename traits_t<T>::signed_t ST;
205   {
206     char *buff;
207     // create format specifiers before the debug output
208     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
209                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
210                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
211                             traits_t<T>::spec, traits_t<T>::spec,
212                             traits_t<ST>::spec, traits_t<ST>::spec,
213                             traits_t<T>::spec, traits_t<T>::spec);
214     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
215     __kmp_str_free(&buff);
216   }
217 #endif
218   /* setup data */
219   th = __kmp_threads[gtid];
220   team = th->th.th_team;
221   active = !team->t.t_serialized;
222 
223 #if USE_ITT_BUILD
224   int itt_need_metadata_reporting =
225       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
226       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
227       team->t.t_active_level == 1;
228 #endif
229 
230 #if KMP_USE_HIER_SCHED
231   use_hier = pr->flags.use_hier;
232 #else
233   use_hier = false;
234 #endif
235 
236   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
237   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
238   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
239 
240   /* Pick up the nomerge/ordered bits from the scheduling type */
241   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
242     pr->flags.nomerge = TRUE;
243     schedule =
244         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
245   } else {
246     pr->flags.nomerge = FALSE;
247   }
248   pr->type_size = traits_t<T>::type_size; // remember the size of variables
249   if (kmp_ord_lower & schedule) {
250     pr->flags.ordered = TRUE;
251     schedule =
252         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
253   } else {
254     pr->flags.ordered = FALSE;
255   }
256   // Ordered overrides nonmonotonic
257   if (pr->flags.ordered) {
258     monotonicity = SCHEDULE_MONOTONIC;
259   }
260 
261   if (schedule == kmp_sch_static) {
262     schedule = __kmp_static;
263   } else {
264     if (schedule == kmp_sch_runtime) {
265       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
266       // not specified)
267       schedule = team->t.t_sched.r_sched_type;
268       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
269       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
270       if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
271         monotonicity = SCHEDULE_MONOTONIC;
272       // Detail the schedule if needed (global controls are differentiated
273       // appropriately)
274       if (schedule == kmp_sch_guided_chunked) {
275         schedule = __kmp_guided;
276       } else if (schedule == kmp_sch_static) {
277         schedule = __kmp_static;
278       }
279       // Use the chunk size specified by OMP_SCHEDULE (or default if not
280       // specified)
281       chunk = team->t.t_sched.chunk;
282 #if USE_ITT_BUILD
283       if (cur_chunk)
284         *cur_chunk = chunk;
285 #endif
286 #ifdef KMP_DEBUG
287       {
288         char *buff;
289         // create format specifiers before the debug output
290         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
291                                 "schedule:%%d chunk:%%%s\n",
292                                 traits_t<ST>::spec);
293         KD_TRACE(10, (buff, gtid, schedule, chunk));
294         __kmp_str_free(&buff);
295       }
296 #endif
297     } else {
298       if (schedule == kmp_sch_guided_chunked) {
299         schedule = __kmp_guided;
300       }
301       if (chunk <= 0) {
302         chunk = KMP_DEFAULT_CHUNK;
303       }
304     }
305 
306     if (schedule == kmp_sch_auto) {
307       // mapping and differentiation: in the __kmp_do_serial_initialize()
308       schedule = __kmp_auto;
309 #ifdef KMP_DEBUG
310       {
311         char *buff;
312         // create format specifiers before the debug output
313         buff = __kmp_str_format(
314             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
315             "schedule:%%d chunk:%%%s\n",
316             traits_t<ST>::spec);
317         KD_TRACE(10, (buff, gtid, schedule, chunk));
318         __kmp_str_free(&buff);
319       }
320 #endif
321     }
322 #if KMP_STATIC_STEAL_ENABLED
323     // map nonmonotonic:dynamic to static steal
324     if (schedule == kmp_sch_dynamic_chunked) {
325       if (monotonicity == SCHEDULE_NONMONOTONIC)
326         schedule = kmp_sch_static_steal;
327     }
328 #endif
329     /* guided analytical not safe for too many threads */
330     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
331       schedule = kmp_sch_guided_iterative_chunked;
332       KMP_WARNING(DispatchManyThreads);
333     }
334     if (schedule == kmp_sch_runtime_simd) {
335       // compiler provides simd_width in the chunk parameter
336       schedule = team->t.t_sched.r_sched_type;
337       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
338       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
339       // Detail the schedule if needed (global controls are differentiated
340       // appropriately)
341       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
342           schedule == __kmp_static) {
343         schedule = kmp_sch_static_balanced_chunked;
344       } else {
345         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
346           schedule = kmp_sch_guided_simd;
347         }
348         chunk = team->t.t_sched.chunk * chunk;
349       }
350 #if USE_ITT_BUILD
351       if (cur_chunk)
352         *cur_chunk = chunk;
353 #endif
354 #ifdef KMP_DEBUG
355       {
356         char *buff;
357         // create format specifiers before the debug output
358         buff = __kmp_str_format(
359             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
360             " chunk:%%%s\n",
361             traits_t<ST>::spec);
362         KD_TRACE(10, (buff, gtid, schedule, chunk));
363         __kmp_str_free(&buff);
364       }
365 #endif
366     }
367     pr->u.p.parm1 = chunk;
368   }
369   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
370               "unknown scheduling type");
371 
372   pr->u.p.count = 0;
373 
374   if (__kmp_env_consistency_check) {
375     if (st == 0) {
376       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
377                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
378     }
379   }
380   // compute trip count
381   if (st == 1) { // most common case
382     if (ub >= lb) {
383       tc = ub - lb + 1;
384     } else { // ub < lb
385       tc = 0; // zero-trip
386     }
387   } else if (st < 0) {
388     if (lb >= ub) {
389       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
390       // where the division needs to be unsigned regardless of the result type
391       tc = (UT)(lb - ub) / (-st) + 1;
392     } else { // lb < ub
393       tc = 0; // zero-trip
394     }
395   } else { // st > 0
396     if (ub >= lb) {
397       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
398       // where the division needs to be unsigned regardless of the result type
399       tc = (UT)(ub - lb) / st + 1;
400     } else { // ub < lb
401       tc = 0; // zero-trip
402     }
403   }
404 
405 #if KMP_STATS_ENABLED
406   if (KMP_MASTER_GTID(gtid)) {
407     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
408   }
409 #endif
410 
411   pr->u.p.lb = lb;
412   pr->u.p.ub = ub;
413   pr->u.p.st = st;
414   pr->u.p.tc = tc;
415 
416 #if KMP_OS_WINDOWS
417   pr->u.p.last_upper = ub + st;
418 #endif /* KMP_OS_WINDOWS */
419 
420   /* NOTE: only the active parallel region(s) has active ordered sections */
421 
422   if (active) {
423     if (pr->flags.ordered) {
424       pr->ordered_bumped = 0;
425       pr->u.p.ordered_lower = 1;
426       pr->u.p.ordered_upper = 0;
427     }
428   }
429 
430   switch (schedule) {
431 #if KMP_STATIC_STEAL_ENABLED
432   case kmp_sch_static_steal: {
433     T ntc, init = 0;
434 
435     KD_TRACE(100,
436              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
437               gtid));
438 
439     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
440     if (nproc > 1 && ntc >= nproc) {
441       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
442       T id = tid;
443       T small_chunk, extras, p_extra = 0;
444       kmp_uint32 old = UNUSED;
445       int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
446       if (traits_t<T>::type_size > 4) {
447         // AC: TODO: check if 16-byte CAS available and use it to
448         // improve performance (probably wait for explicit request
449         // before spending time on this).
450         // For now use dynamically allocated per-private-buffer lock,
451         // free memory in __kmp_dispatch_next when status==0.
452         pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
453         __kmp_init_lock(pr->u.p.steal_lock);
454       }
455 
456 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
457       // Iterations are divided in a 60/40 skewed distribution among CORE and
458       // ATOM processors for hybrid systems
459       bool use_hybrid = false;
460       kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
461       T first_thread_with_ecore = 0;
462       T num_procs_with_pcore = 0;
463       T num_procs_with_ecore = 0;
464       T p_ntc = 0, e_ntc = 0;
465       if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
466           __kmp_affinity.type != affinity_explicit) {
467         use_hybrid = true;
468         core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
469         if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
470             __kmp_first_osid_with_ecore > -1) {
471           for (int i = 0; i < team->t.t_nproc; ++i) {
472             kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
473                                           ->th.th_topology_attrs.core_type;
474             int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
475             if (id == __kmp_first_osid_with_ecore) {
476               first_thread_with_ecore =
477                   team->t.t_threads[i]->th.th_info.ds.ds_tid;
478             }
479             if (type == KMP_HW_CORE_TYPE_CORE) {
480               num_procs_with_pcore++;
481             } else if (type == KMP_HW_CORE_TYPE_ATOM) {
482               num_procs_with_ecore++;
483             } else {
484               use_hybrid = false;
485               break;
486             }
487           }
488         }
489         if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
490           float multiplier = 60.0 / 40.0;
491           float p_ratio = (float)num_procs_with_pcore / nproc;
492           float e_ratio = (float)num_procs_with_ecore / nproc;
493           float e_multiplier =
494               (float)1 /
495               (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
496           float p_multiplier = multiplier * e_multiplier;
497           p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
498           if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
499             e_ntc =
500                 (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
501           else
502             e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
503           KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
504 
505           // Use regular static steal if not enough chunks for skewed
506           // distribution
507           use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
508                                        e_ntc >= num_procs_with_ecore)
509                             ? true
510                             : false);
511         } else {
512           use_hybrid = false;
513         }
514       }
515       pr->flags.use_hybrid = use_hybrid;
516       pr->u.p.pchunks = p_ntc;
517       pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
518       pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
519 
520       if (use_hybrid) {
521         KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
522         T big_chunk = p_ntc / num_procs_with_pcore;
523         small_chunk = e_ntc / num_procs_with_ecore;
524 
525         extras =
526             (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
527 
528         p_extra = (big_chunk - small_chunk);
529 
530         if (core_type == KMP_HW_CORE_TYPE_CORE) {
531           if (id < first_thread_with_ecore) {
532             init =
533                 id * small_chunk + id * p_extra + (id < extras ? id : extras);
534           } else {
535             init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
536                    (id < extras ? id : extras);
537           }
538         } else {
539           if (id == first_thread_with_ecore) {
540             init =
541                 id * small_chunk + id * p_extra + (id < extras ? id : extras);
542           } else {
543             init = id * small_chunk + first_thread_with_ecore * p_extra +
544                    (id < extras ? id : extras);
545           }
546         }
547         p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
548       } else
549 #endif
550       {
551         small_chunk = ntc / nproc;
552         extras = ntc % nproc;
553         init = id * small_chunk + (id < extras ? id : extras);
554         p_extra = 0;
555       }
556       pr->u.p.count = init;
557       if (claimed) { // are we succeeded in claiming own buffer?
558         pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
559         // Other threads will inspect steal_flag when searching for a victim.
560         // READY means other threads may steal from this thread from now on.
561         KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
562       } else {
563         // other thread has stolen whole our range
564         KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
565         pr->u.p.ub = init; // mark there is no iterations to work on
566       }
567       pr->u.p.parm2 = ntc; // save number of chunks
568       // parm3 is the number of times to attempt stealing which is
569       // nproc (just a heuristics, could be optimized later on).
570       pr->u.p.parm3 = nproc;
571       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
572       break;
573     } else {
574       /* too few chunks: switching to kmp_sch_dynamic_chunked */
575       schedule = kmp_sch_dynamic_chunked;
576       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
577                      "kmp_sch_dynamic_chunked\n",
578                      gtid));
579       goto dynamic_init;
580       break;
581     } // if
582   } // case
583 #endif
584   case kmp_sch_static_balanced: {
585     T init, limit;
586 
587     KD_TRACE(
588         100,
589         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
590          gtid));
591 
592     if (nproc > 1) {
593       T id = tid;
594 
595       if (tc < nproc) {
596         if (id < tc) {
597           init = id;
598           limit = id;
599           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
600         } else {
601           pr->u.p.count = 1; /* means no more chunks to execute */
602           pr->u.p.parm1 = FALSE;
603           break;
604         }
605       } else {
606         T small_chunk = tc / nproc;
607         T extras = tc % nproc;
608         init = id * small_chunk + (id < extras ? id : extras);
609         limit = init + small_chunk - (id < extras ? 0 : 1);
610         pr->u.p.parm1 = (id == nproc - 1);
611       }
612     } else {
613       if (tc > 0) {
614         init = 0;
615         limit = tc - 1;
616         pr->u.p.parm1 = TRUE;
617       } else {
618         // zero trip count
619         pr->u.p.count = 1; /* means no more chunks to execute */
620         pr->u.p.parm1 = FALSE;
621         break;
622       }
623     }
624 #if USE_ITT_BUILD
625     // Calculate chunk for metadata report
626     if (itt_need_metadata_reporting)
627       if (cur_chunk)
628         *cur_chunk = limit - init + 1;
629 #endif
630     if (st == 1) {
631       pr->u.p.lb = lb + init;
632       pr->u.p.ub = lb + limit;
633     } else {
634       // calculated upper bound, "ub" is user-defined upper bound
635       T ub_tmp = lb + limit * st;
636       pr->u.p.lb = lb + init * st;
637       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
638       // it exactly
639       if (st > 0) {
640         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
641       } else {
642         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
643       }
644     }
645     if (pr->flags.ordered) {
646       pr->u.p.ordered_lower = init;
647       pr->u.p.ordered_upper = limit;
648     }
649     break;
650   } // case
651   case kmp_sch_static_balanced_chunked: {
652     // similar to balanced, but chunk adjusted to multiple of simd width
653     T nth = nproc;
654     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
655                    " -> falling-through to static_greedy\n",
656                    gtid));
657     schedule = kmp_sch_static_greedy;
658     if (nth > 1)
659       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
660     else
661       pr->u.p.parm1 = tc;
662     break;
663   } // case
664   case kmp_sch_guided_simd:
665   case kmp_sch_guided_iterative_chunked: {
666     KD_TRACE(
667         100,
668         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
669          " case\n",
670          gtid));
671 
672     if (nproc > 1) {
673       if ((2L * chunk + 1) * nproc >= tc) {
674         /* chunk size too large, switch to dynamic */
675         schedule = kmp_sch_dynamic_chunked;
676         goto dynamic_init;
677       } else {
678         // when remaining iters become less than parm2 - switch to dynamic
679         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
680         *(double *)&pr->u.p.parm3 =
681             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
682       }
683     } else {
684       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
685                      "kmp_sch_static_greedy\n",
686                      gtid));
687       schedule = kmp_sch_static_greedy;
688       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
689       KD_TRACE(
690           100,
691           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
692            gtid));
693       pr->u.p.parm1 = tc;
694     } // if
695   } // case
696   break;
697   case kmp_sch_guided_analytical_chunked: {
698     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
699                    "kmp_sch_guided_analytical_chunked case\n",
700                    gtid));
701 
702     if (nproc > 1) {
703       if ((2L * chunk + 1) * nproc >= tc) {
704         /* chunk size too large, switch to dynamic */
705         schedule = kmp_sch_dynamic_chunked;
706         goto dynamic_init;
707       } else {
708         /* commonly used term: (2 nproc - 1)/(2 nproc) */
709         DBL x;
710 
711 #if KMP_USE_X87CONTROL
712         /* Linux* OS already has 64-bit computation by default for long double,
713            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
714            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
715            instead of the default 53-bit. Even though long double doesn't work
716            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
717            expected to impact the correctness of the algorithm, but this has not
718            been mathematically proven. */
719         // save original FPCW and set precision to 64-bit, as
720         // Windows* OS on IA-32 architecture defaults to 53-bit
721         unsigned int oldFpcw = _control87(0, 0);
722         _control87(_PC_64, _MCW_PC); // 0,0x30000
723 #endif
724         /* value used for comparison in solver for cross-over point */
725         KMP_ASSERT(tc > 0);
726         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
727 
728         /* crossover point--chunk indexes equal to or greater than
729            this point switch to dynamic-style scheduling */
730         UT cross;
731 
732         /* commonly used term: (2 nproc - 1)/(2 nproc) */
733         x = 1.0 - 0.5 / (double)nproc;
734 
735 #ifdef KMP_DEBUG
736         { // test natural alignment
737           struct _test_a {
738             char a;
739             union {
740               char b;
741               DBL d;
742             };
743           } t;
744           ptrdiff_t natural_alignment =
745               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
746           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
747           // long)natural_alignment );
748           KMP_DEBUG_ASSERT(
749               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
750         }
751 #endif // KMP_DEBUG
752 
753         /* save the term in thread private dispatch structure */
754         *(DBL *)&pr->u.p.parm3 = x;
755 
756         /* solve for the crossover point to the nearest integer i for which C_i
757            <= chunk */
758         {
759           UT left, right, mid;
760           long double p;
761 
762           /* estimate initial upper and lower bound */
763 
764           /* doesn't matter what value right is as long as it is positive, but
765              it affects performance of the solver */
766           right = 229;
767           p = __kmp_pow<UT>(x, right);
768           if (p > target) {
769             do {
770               p *= p;
771               right <<= 1;
772             } while (p > target && right < (1 << 27));
773             /* lower bound is previous (failed) estimate of upper bound */
774             left = right >> 1;
775           } else {
776             left = 0;
777           }
778 
779           /* bisection root-finding method */
780           while (left + 1 < right) {
781             mid = (left + right) / 2;
782             if (__kmp_pow<UT>(x, mid) > target) {
783               left = mid;
784             } else {
785               right = mid;
786             }
787           } // while
788           cross = right;
789         }
790         /* assert sanity of computed crossover point */
791         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
792                    __kmp_pow<UT>(x, cross) <= target);
793 
794         /* save the crossover point in thread private dispatch structure */
795         pr->u.p.parm2 = cross;
796 
797 // C75803
798 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
799 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
800 #else
801 #define GUIDED_ANALYTICAL_WORKAROUND (x)
802 #endif
803         /* dynamic-style scheduling offset */
804         pr->u.p.count = tc -
805                         __kmp_dispatch_guided_remaining(
806                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
807                         cross * chunk;
808 #if KMP_USE_X87CONTROL
809         // restore FPCW
810         _control87(oldFpcw, _MCW_PC);
811 #endif
812       } // if
813     } else {
814       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
815                      "kmp_sch_static_greedy\n",
816                      gtid));
817       schedule = kmp_sch_static_greedy;
818       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
819       pr->u.p.parm1 = tc;
820     } // if
821   } // case
822   break;
823   case kmp_sch_static_greedy:
824     KD_TRACE(
825         100,
826         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
827          gtid));
828     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
829     break;
830   case kmp_sch_static_chunked:
831   case kmp_sch_dynamic_chunked:
832   dynamic_init:
833     if (tc == 0)
834       break;
835     if (pr->u.p.parm1 <= 0)
836       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
837     else if (pr->u.p.parm1 > tc)
838       pr->u.p.parm1 = tc;
839     // Store the total number of chunks to prevent integer overflow during
840     // bounds calculations in the get next chunk routine.
841     pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
842     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
843                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
844                    gtid));
845     break;
846   case kmp_sch_trapezoidal: {
847     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
848 
849     T parm1, parm2, parm3, parm4;
850     KD_TRACE(100,
851              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
852               gtid));
853 
854     parm1 = chunk;
855 
856     /* F : size of the first cycle */
857     parm2 = (tc / (2 * nproc));
858 
859     if (parm2 < 1) {
860       parm2 = 1;
861     }
862 
863     /* L : size of the last cycle.  Make sure the last cycle is not larger
864        than the first cycle. */
865     if (parm1 < 1) {
866       parm1 = 1;
867     } else if (parm1 > parm2) {
868       parm1 = parm2;
869     }
870 
871     /* N : number of cycles */
872     parm3 = (parm2 + parm1);
873     parm3 = (2 * tc + parm3 - 1) / parm3;
874 
875     if (parm3 < 2) {
876       parm3 = 2;
877     }
878 
879     /* sigma : decreasing incr of the trapezoid */
880     parm4 = (parm3 - 1);
881     parm4 = (parm2 - parm1) / parm4;
882 
883     // pointless check, because parm4 >= 0 always
884     // if ( parm4 < 0 ) {
885     //    parm4 = 0;
886     //}
887 
888     pr->u.p.parm1 = parm1;
889     pr->u.p.parm2 = parm2;
890     pr->u.p.parm3 = parm3;
891     pr->u.p.parm4 = parm4;
892   } // case
893   break;
894 
895   default: {
896     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
897                 KMP_HNT(GetNewerLibrary), // Hint
898                 __kmp_msg_null // Variadic argument list terminator
899     );
900   } break;
901   } // switch
902   pr->schedule = schedule;
903 }
904 
905 #if KMP_USE_HIER_SCHED
// Entry points used by __kmp_dispatch_init to set up hierarchical scheduling
// when the schedule is kmp_sch_runtime.
// The primary template is only declared here; the four explicit
// specializations below (kmp_int32, kmp_uint32, kmp_int64, kmp_uint64) each
// forward to __kmp_dispatch_init_hierarchy<T>, passing the size, layers and
// scheds members of __kmp_hier_scheds together with the caller's loc, lb, ub
// and st.  The only difference between them is the chunk table handed over:
// the 32-bit specializations pass __kmp_hier_scheds.small_chunks, the 64-bit
// specializations pass __kmp_hier_scheds.large_chunks.
906 template <typename T>
907 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
908                                              typename traits_t<T>::signed_t st);
// kmp_int32: uses small_chunks
909 template <>
910 inline void
911 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
912                                             kmp_int32 ub, kmp_int32 st) {
913   __kmp_dispatch_init_hierarchy<kmp_int32>(
914       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
915       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
916 }
// kmp_uint32: uses small_chunks; note the stride st is the signed kmp_int32
917 template <>
918 inline void
919 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
920                                              kmp_uint32 ub, kmp_int32 st) {
921   __kmp_dispatch_init_hierarchy<kmp_uint32>(
922       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
923       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
924 }
// kmp_int64: uses large_chunks
925 template <>
926 inline void
927 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
928                                             kmp_int64 ub, kmp_int64 st) {
929   __kmp_dispatch_init_hierarchy<kmp_int64>(
930       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
931       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
932 }
// kmp_uint64: uses large_chunks; note the stride st is the signed kmp_int64
933 template <>
934 inline void
935 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
936                                              kmp_uint64 ub, kmp_int64 st) {
937   __kmp_dispatch_init_hierarchy<kmp_uint64>(
938       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
939       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
940 }
941 
942 // free all the hierarchy scheduling memory associated with the team
// Walks the first num_disp_buff entries of team->t.t_disp_buffer and, for
// every entry whose `hier` pointer is non-null, calls hier->deallocate() and
// then releases the hierarchy object itself with __kmp_free().
// NOTE(review): sh->hier is not reset after being freed -- presumably the
// dispatch buffers are not inspected again before they are released; confirm
// against the callers.
943 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
      // __kmp_dispatch_num_buffers entries when t_max_nproc > 1, otherwise 2
944   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
945   for (int i = 0; i < num_disp_buff; ++i) {
946     // type does not matter here so use kmp_int32
947     auto sh =
948         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
949             &team->t.t_disp_buffer[i]);
950     if (sh->hier) {
          // release whatever the hierarchy owns first, then the object itself
951       sh->hier->deallocate();
952       __kmp_free(sh->hier);
953     }
954   }
955 }
956 #endif
957 
958 // UT - unsigned flavor of T, ST - signed flavor of T,
959 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
//
// __kmp_dispatch_init: per-thread initialization of a dispatched loop.
//
// Steps performed, in order:
//  1. Validates gtid, makes sure the parallel runtime is initialized
//     (__kmp_parallel_initialize) and calls __kmp_resume_if_soft_paused().
//  2. Looks up the calling thread and its team and records `loc` in
//     th->th.th_ident.
//  3. (KMP_USE_HIER_SCHED only) turns hierarchical scheduling off for ordered
//     loops and, for kmp_sch_runtime with a non-empty __kmp_hier_scheds,
//     initializes the runtime hierarchy.
//  4. Picks the private buffer `pr` (and, for a non-serialized team, the
//     shared buffer `sh`), waiting until the shared buffer's buffer_index
//     matches this thread's buffer index.
//  5. Calls __kmp_dispatch_init_algorithm() to fill in `pr`.
//  6. For a non-serialized team, installs the ordered enter/exit callbacks and
//     publishes pr/sh as the thread's current dispatch buffers; optionally
//     reports ITT loop metadata.
//  7. Emits the OMPT work-begin callback (if enabled) and pushes the
//     OMP_loop_dynamic partitioned timer.
//
// Parameters:
//   loc       source location descriptor; stored in th_ident and forwarded.
//   gtid      global thread id, used to index __kmp_threads.
//   schedule  requested schedule kind.
//   lb, ub    forwarded to __kmp_dispatch_init_algorithm (labelled "lb"/"ub"
//             in the debug trace; presumably the loop bounds).
//   st        forwarded likewise (labelled "st"; presumably the stride).
//   chunk     forwarded likewise (labelled "chunk").
//   push_ws   not referenced anywhere in this function body.
960 template <typename T>
961 static void
962 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
963                     T ub, typename traits_t<T>::signed_t st,
964                     typename traits_t<T>::signed_t chunk, int push_ws) {
965   typedef typename traits_t<T>::unsigned_t UT;
966
967   int active;
968   kmp_info_t *th;
969   kmp_team_t *team;
970   kmp_uint32 my_buffer_index;
971   dispatch_private_info_template<T> *pr;
972   dispatch_shared_info_template<T> volatile *sh;
973
      // The typed template views must have the same size as the untyped
      // buffers they are reinterpret_cast from further down.
974   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
975                    sizeof(dispatch_private_info));
976   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
977                    sizeof(dispatch_shared_info));
978   __kmp_assert_valid_gtid(gtid);
979
980   if (!TCR_4(__kmp_init_parallel))
981     __kmp_parallel_initialize();
982
983   __kmp_resume_if_soft_paused();
984
985 #if INCLUDE_SSC_MARKS
986   SSC_MARK_DISPATCH_INIT();
987 #endif
988 #ifdef KMP_DEBUG
      // ST is only declared in debug builds; it is used by both trace blocks.
989   typedef typename traits_t<T>::signed_t ST;
990   {
991     char *buff;
992     // create format specifiers before the debug output
993     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
994                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
995                             traits_t<ST>::spec, traits_t<T>::spec,
996                             traits_t<T>::spec, traits_t<ST>::spec);
997     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
998     __kmp_str_free(&buff);
999   }
1000 #endif
1001   /* setup data */
1002   th = __kmp_threads[gtid];
1003   team = th->th.th_team;
       // "active" means the team is not serialized
1004   active = !team->t.t_serialized;
1005   th->th.th_ident = loc;
1006
1007   // Any half-decent optimizer will remove this test when the blocks are empty
1008   // since the macros expand to nothing
1009   // when statistics are disabled.
1010   if (schedule == __kmp_static) {
1011     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
1012   } else {
1013     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
1014   }
1015
1016 #if KMP_USE_HIER_SCHED
1017   // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
1018   // Hierarchical scheduling does not work with ordered, so if ordered is
1019   // detected, then revert back to threaded scheduling.
1020   bool ordered;
1021   enum sched_type my_sched = schedule;
       // Look at the private buffer for the current index without advancing
       // th_disp_index (the increment happens later, on the active path).
1022   my_buffer_index = th->th.th_dispatch->th_disp_index;
1023   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1024       &th->th.th_dispatch
1025            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1026   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
       // Map a schedule in [kmp_nm_lower, kmp_nm_upper) onto the range that
       // starts at kmp_sch_lower before testing the ordered bit.
1027   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
1028     my_sched =
1029         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
1030   ordered = (kmp_ord_lower & my_sched);
1031   if (pr->flags.use_hier) {
1032     if (ordered) {
1033       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
1034                      "Disabling hierarchical scheduling.\n",
1035                      gtid));
1036       pr->flags.use_hier = FALSE;
1037     }
1038   }
1039   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
1040     // Don't use hierarchical for ordered parallel loops and don't
1041     // use the runtime hierarchy if one was specified in the program
1042     if (!ordered && !pr->flags.use_hier)
1043       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
1044   }
1045 #endif // KMP_USE_HIER_SCHED
1046
1047 #if USE_ITT_BUILD
1048   kmp_uint64 cur_chunk = chunk;
       // Metadata is reported only when all of these conditions hold; the
       // flag is consumed after __kmp_dispatch_init_algorithm() below.
1049   int itt_need_metadata_reporting =
1050       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
1051       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
1052       team->t.t_active_level == 1;
1053 #endif
1054   if (!active) {
         // Serialized team: `sh` is left unassigned on this path; it is only
         // read under `if (active)` further down.
1055     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1056         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1057   } else {
1058     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1059                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1060
         // Take this loop's buffer index and advance the per-thread counter.
1061     my_buffer_index = th->th.th_dispatch->th_disp_index++;
1062
1063     /* What happens when number of threads changes, need to resize buffer? */
1064     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1065         &th->th.th_dispatch
1066              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1067     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
1068         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1069     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
1070                   my_buffer_index));
1071     if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
1072       KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
1073                      " sh->buffer_index:%d\n",
1074                      gtid, my_buffer_index, sh->buffer_index));
           // Wait on sh->buffer_index with the __kmp_eq predicate, i.e. until
           // the shared slot has reached this thread's buffer index.
1075       __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1076                              __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1077       // Note: KMP_WAIT() cannot be used there: buffer index and
1078       // my_buffer_index are *always* 32-bit integers.
1079       KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1080                      "sh->buffer_index:%d\n",
1081                      gtid, my_buffer_index, sh->buffer_index));
1082     }
1083   }
1084
       // Fill in the private buffer; team size and thread id within the team
       // are passed converted to T.
1085   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
1086 #if USE_ITT_BUILD
1087                                 &cur_chunk,
1088 #endif
1089                                 chunk, (T)th->th.th_team_nproc,
1090                                 (T)th->th.th_info.ds.ds_tid);
1091   if (active) {
         // Non-ordered loops get the *_error callbacks, ordered loops get the
         // real enter/exit implementations.
1092     if (pr->flags.ordered == 0) {
1093       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
1094       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
1095     } else {
1096       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
1097       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
1098     }
         // Publish the buffers; __kmp_dispatch_finish() and
         // __kmp_dispatch_finish_chunk() read them back from here.
1099     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1100     th->th.th_dispatch->th_dispatch_sh_current =
1101         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1102 #if USE_ITT_BUILD
1103     if (pr->flags.ordered) {
1104       __kmp_itt_ordered_init(gtid);
1105     }
1106     // Report loop metadata
1107     if (itt_need_metadata_reporting) {
1108       // Only report metadata by primary thread of active team at level 1
           // schedtype as set below: 0 static (default), 1 dynamic, 2 guided,
           // 3 anything else
1109       kmp_uint64 schedtype = 0;
1110       switch (schedule) {
1111       case kmp_sch_static_chunked:
1112       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1113         break;
1114       case kmp_sch_static_greedy:
1115         cur_chunk = pr->u.p.parm1;
1116         break;
1117       case kmp_sch_dynamic_chunked:
1118         schedtype = 1;
1119         break;
1120       case kmp_sch_guided_iterative_chunked:
1121       case kmp_sch_guided_analytical_chunked:
1122       case kmp_sch_guided_simd:
1123         schedtype = 2;
1124         break;
1125       default:
1126         // Should we put this case under "static"?
1127         // case kmp_sch_static_steal:
1128         schedtype = 3;
1129         break;
1130       }
1131       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
1132     }
         // NOTE(review): this KMP_USE_HIER_SCHED block is nested inside
         // #if USE_ITT_BUILD, so it is compiled out when USE_ITT_BUILD is 0
         // although nothing in it refers to ITT -- confirm this is intended.
1133 #if KMP_USE_HIER_SCHED
1134     if (pr->flags.use_hier) {
1135       pr->u.p.count = 0;
1136       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
1137     }
1138 #endif // KMP_USE_HIER_SCHED
1139 #endif /* USE_ITT_BUILD */
1140   }
1141
1142 #ifdef KMP_DEBUG
1143   {
1144     char *buff;
1145     // create format specifiers before the debug output
1146     buff = __kmp_str_format(
1147         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1148         "lb:%%%s ub:%%%s"
1149         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1150         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1151         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1152         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1153         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1154         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1155     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
1156                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1157                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1158                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
1159     __kmp_str_free(&buff);
1160   }
1161 #endif
1162 #if OMPT_SUPPORT && OMPT_OPTIONAL
       // Report the start of the loop to an OMPT tool, passing the trip count
       // stored in pr->u.p.tc.
1163   if (ompt_enabled.ompt_callback_work) {
1164     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1165     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1166     ompt_callbacks.ompt_callback(ompt_callback_work)(
1167         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1168         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1169   }
1170 #endif
1171   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1172 }
1173 
1174 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1175  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1176  * every chunk of iterations.  If the ordered section(s) were not executed
1177  * for this iteration (or every iteration in this chunk), we need to set the
1178  * ordered iteration counters so that the next thread can proceed. */
// __kmp_dispatch_finish: per-iteration variant.
//   gtid  global thread id, used to index __kmp_threads.
//   loc   not referenced in this function body.
// Does nothing (apart from tracing) when the thread's team is serialized.
// Otherwise it reads the thread's current private/shared dispatch buffers:
//   - if pr->ordered_bumped is non-zero it is simply cleared;
//   - else the thread waits on sh->u.s.ordered_iteration against
//     pr->u.p.ordered_lower (predicate __kmp_ge, presumably "until
//     ordered_iteration >= lower") and then bumps ordered_iteration by one
//     via test_then_inc.
1179 template <typename UT>
1180 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1181   typedef typename traits_t<UT>::signed_t ST;
1182   __kmp_assert_valid_gtid(gtid);
1183   kmp_info_t *th = __kmp_threads[gtid];
1184
1185   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1186   if (!th->th.th_team->t.t_serialized) {
1187
         // Buffers published by __kmp_dispatch_init for the current loop.
1188     dispatch_private_info_template<UT> *pr =
1189         reinterpret_cast<dispatch_private_info_template<UT> *>(
1190             th->th.th_dispatch->th_dispatch_pr_current);
1191     dispatch_shared_info_template<UT> volatile *sh =
1192         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1193             th->th.th_dispatch->th_dispatch_sh_current);
1194     KMP_DEBUG_ASSERT(pr);
1195     KMP_DEBUG_ASSERT(sh);
1196     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1197                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1198
1199     if (pr->ordered_bumped) {
           // The counter was already advanced for this iteration; just clear
           // the flag.
1200       KD_TRACE(
1201           1000,
1202           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1203            gtid));
1204       pr->ordered_bumped = 0;
1205     } else {
1206       UT lower = pr->u.p.ordered_lower;
1207
1208 #ifdef KMP_DEBUG
1209       {
1210         char *buff;
1211         // create format specifiers before the debug output
1212         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1213                                 "ordered_iteration:%%%s lower:%%%s\n",
1214                                 traits_t<UT>::spec, traits_t<UT>::spec);
1215         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1216         __kmp_str_free(&buff);
1217       }
1218 #endif
1219
1220       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1221                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1222       KMP_MB(); /* is this necessary? */
1223 #ifdef KMP_DEBUG
1224       {
1225         char *buff;
1226         // create format specifiers before the debug output
1227         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1228                                 "ordered_iteration:%%%s lower:%%%s\n",
1229                                 traits_t<UT>::spec, traits_t<UT>::spec);
1230         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1231         __kmp_str_free(&buff);
1232       }
1233 #endif
1234
           // Advance the shared counter by one; the unsigned counter is
           // accessed through a signed (ST) pointer for test_then_inc.
1235       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1236     } // if
1237   } // if
1238   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1239 }
1240 
1241 #ifdef KMP_GOMP_COMPAT
1242 
1243 template <typename UT>
1244 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1245   typedef typename traits_t<UT>::signed_t ST;
1246   __kmp_assert_valid_gtid(gtid);
1247   kmp_info_t *th = __kmp_threads[gtid];
1248 
1249   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1250   if (!th->th.th_team->t.t_serialized) {
1251     dispatch_private_info_template<UT> *pr =
1252         reinterpret_cast<dispatch_private_info_template<UT> *>(
1253             th->th.th_dispatch->th_dispatch_pr_current);
1254     dispatch_shared_info_template<UT> volatile *sh =
1255         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1256             th->th.th_dispatch->th_dispatch_sh_current);
1257     KMP_DEBUG_ASSERT(pr);
1258     KMP_DEBUG_ASSERT(sh);
1259     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1260                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1261 
1262     UT lower = pr->u.p.ordered_lower;
1263     UT upper = pr->u.p.ordered_upper;
1264     UT inc = upper - lower + 1;
1265 
1266     if (pr->ordered_bumped == inc) {
1267       KD_TRACE(
1268           1000,
1269           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1270            gtid));
1271       pr->ordered_bumped = 0;
1272     } else {
1273       inc -= pr->ordered_bumped;
1274 
1275 #ifdef KMP_DEBUG
1276       {
1277         char *buff;
1278         // create format specifiers before the debug output
1279         buff = __kmp_str_format(
1280             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1281             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1282             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1283         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1284         __kmp_str_free(&buff);
1285       }
1286 #endif
1287 
1288       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1289                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1290 
1291       KMP_MB(); /* is this necessary? */
1292       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1293                       "ordered_bumped to zero\n",
1294                       gtid));
1295       pr->ordered_bumped = 0;
1296 //!!!!! TODO check if the inc should be unsigned, or signed???
1297 #ifdef KMP_DEBUG
1298       {
1299         char *buff;
1300         // create format specifiers before the debug output
1301         buff = __kmp_str_format(
1302             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1303             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1304             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1305             traits_t<UT>::spec);
1306         KD_TRACE(1000,
1307                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1308         __kmp_str_free(&buff);
1309       }
1310 #endif
1311 
1312       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1313     }
1314     //        }
1315   }
1316   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1317 }
1318 
1319 #endif /* KMP_GOMP_COMPAT */
1320 
1321 template <typename T>
1322 int __kmp_dispatch_next_algorithm(int gtid,
1323                                   dispatch_private_info_template<T> *pr,
1324                                   dispatch_shared_info_template<T> volatile *sh,
1325                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1326                                   typename traits_t<T>::signed_t *p_st, T nproc,
1327                                   T tid) {
1328   typedef typename traits_t<T>::unsigned_t UT;
1329   typedef typename traits_t<T>::signed_t ST;
1330   typedef typename traits_t<T>::floating_t DBL;
1331   int status = 0;
1332   bool last = false;
1333   T start;
1334   ST incr;
1335   UT limit, trip, init;
1336   kmp_info_t *th = __kmp_threads[gtid];
1337   kmp_team_t *team = th->th.th_team;
1338 
1339   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1340                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1341   KMP_DEBUG_ASSERT(pr);
1342   KMP_DEBUG_ASSERT(sh);
1343   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1344 #ifdef KMP_DEBUG
1345   {
1346     char *buff;
1347     // create format specifiers before the debug output
1348     buff =
1349         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1350                          "sh:%%p nproc:%%%s tid:%%%s\n",
1351                          traits_t<T>::spec, traits_t<T>::spec);
1352     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1353     __kmp_str_free(&buff);
1354   }
1355 #endif
1356 
1357   // zero trip count
1358   if (pr->u.p.tc == 0) {
1359     KD_TRACE(10,
1360              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1361               "zero status:%d\n",
1362               gtid, status));
1363     return 0;
1364   }
1365 
1366   switch (pr->schedule) {
1367 #if KMP_STATIC_STEAL_ENABLED
1368   case kmp_sch_static_steal: {
1369     T chunk = pr->u.p.parm1;
1370     UT nchunks = pr->u.p.parm2;
1371     KD_TRACE(100,
1372              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1373               gtid));
1374 
1375     trip = pr->u.p.tc - 1;
1376 
1377     if (traits_t<T>::type_size > 4) {
1378       // use lock for 8-byte induction variable.
1379       // TODO (optional): check presence and use 16-byte CAS
1380       kmp_lock_t *lck = pr->u.p.steal_lock;
1381       KMP_DEBUG_ASSERT(lck != NULL);
1382       if (pr->u.p.count < (UT)pr->u.p.ub) {
1383         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1384         __kmp_acquire_lock(lck, gtid);
1385         // try to get own chunk of iterations
1386         init = (pr->u.p.count)++;
1387         status = (init < (UT)pr->u.p.ub);
1388         __kmp_release_lock(lck, gtid);
1389       } else {
1390         status = 0; // no own chunks
1391       }
1392       if (!status) { // try to steal
1393         kmp_lock_t *lckv; // victim buffer's lock
1394         T while_limit = pr->u.p.parm3;
1395         T while_index = 0;
1396         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1397                   __kmp_dispatch_num_buffers; // current loop index
1398         // note: victim thread can potentially execute another loop
1399         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1400         while ((!status) && (while_limit != ++while_index)) {
1401           dispatch_private_info_template<T> *v;
1402           T remaining;
1403           T victimId = pr->u.p.parm4;
1404           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1405           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1406               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1407           KMP_DEBUG_ASSERT(v);
1408           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1409                  oldVictimId != victimId) {
1410             victimId = (victimId + 1) % nproc;
1411             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1412                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1413             KMP_DEBUG_ASSERT(v);
1414           }
1415           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1416             continue; // try once more (nproc attempts in total)
1417           }
1418           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1419             kmp_uint32 old = UNUSED;
1420             // try to steal whole range from inactive victim
1421             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1422             if (status) {
1423               // initialize self buffer with victim's whole range of chunks
1424               T id = victimId;
1425               T small_chunk = 0, extras = 0, p_extra = 0;
1426               __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1427                                               init, small_chunk, extras,
1428                                               p_extra);
1429               __kmp_acquire_lock(lck, gtid);
1430               pr->u.p.count = init + 1; // exclude one we execute immediately
1431               pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1432               __kmp_release_lock(lck, gtid);
1433               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1434               // no need to reinitialize other thread invariants: lb, st, etc.
1435 #ifdef KMP_DEBUG
1436               {
1437                 char *buff;
1438                 // create format specifiers before the debug output
1439                 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1440                                         "stolen chunks from T#%%d, "
1441                                         "count:%%%s ub:%%%s\n",
1442                                         traits_t<UT>::spec, traits_t<T>::spec);
1443                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1444                 __kmp_str_free(&buff);
1445               }
1446 #endif
1447               // activate non-empty buffer and let others steal from us
1448               if (pr->u.p.count < (UT)pr->u.p.ub)
1449                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1450               break;
1451             }
1452           }
1453           if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1454               v->u.p.count >= (UT)v->u.p.ub) {
1455             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1456             continue; // no chunks to steal, try next victim
1457           }
1458           lckv = v->u.p.steal_lock;
1459           KMP_ASSERT(lckv != NULL);
1460           __kmp_acquire_lock(lckv, gtid);
1461           limit = v->u.p.ub; // keep initial ub
1462           if (v->u.p.count >= limit) {
1463             __kmp_release_lock(lckv, gtid);
1464             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1465             continue; // no chunks to steal, try next victim
1466           }
1467 
1468           // stealing succeded, reduce victim's ub by 1/4 of undone chunks
1469           // TODO: is this heuristics good enough??
1470           remaining = limit - v->u.p.count;
1471           if (remaining > 7) {
1472             // steal 1/4 of remaining
1473             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1474             init = (v->u.p.ub -= (remaining >> 2));
1475           } else {
1476             // steal 1 chunk of 1..7 remaining
1477             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1478             init = (v->u.p.ub -= 1);
1479           }
1480           __kmp_release_lock(lckv, gtid);
1481 #ifdef KMP_DEBUG
1482           {
1483             char *buff;
1484             // create format specifiers before the debug output
1485             buff = __kmp_str_format(
1486                 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1487                 "count:%%%s ub:%%%s\n",
1488                 traits_t<UT>::spec, traits_t<UT>::spec);
1489             KD_TRACE(10, (buff, gtid, victimId, init, limit));
1490             __kmp_str_free(&buff);
1491           }
1492 #endif
1493           KMP_DEBUG_ASSERT(init + 1 <= limit);
1494           pr->u.p.parm4 = victimId; // remember victim to steal from
1495           status = 1;
1496           // now update own count and ub with stolen range excluding init chunk
1497           __kmp_acquire_lock(lck, gtid);
1498           pr->u.p.count = init + 1;
1499           pr->u.p.ub = limit;
1500           __kmp_release_lock(lck, gtid);
1501           // activate non-empty buffer and let others steal from us
1502           if (init + 1 < limit)
1503             KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1504         } // while (search for victim)
1505       } // if (try to find victim and steal)
1506     } else {
1507       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1508       // as all operations on pair (count, ub) must be done atomically
1509       typedef union {
1510         struct {
1511           UT count;
1512           T ub;
1513         } p;
1514         kmp_int64 b;
1515       } union_i4;
1516       union_i4 vold, vnew;
1517       if (pr->u.p.count < (UT)pr->u.p.ub) {
1518         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1519         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1520         vnew.b = vold.b;
1521         vnew.p.count++; // get chunk from head of self range
1522         while (!KMP_COMPARE_AND_STORE_REL64(
1523             (volatile kmp_int64 *)&pr->u.p.count,
1524             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1525             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1526           KMP_CPU_PAUSE();
1527           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1528           vnew.b = vold.b;
1529           vnew.p.count++;
1530         }
1531         init = vold.p.count;
1532         status = (init < (UT)vold.p.ub);
1533       } else {
1534         status = 0; // no own chunks
1535       }
1536       if (!status) { // try to steal
1537         T while_limit = pr->u.p.parm3;
1538         T while_index = 0;
1539         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1540                   __kmp_dispatch_num_buffers; // current loop index
1541         // note: victim thread can potentially execute another loop
1542         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1543         while ((!status) && (while_limit != ++while_index)) {
1544           dispatch_private_info_template<T> *v;
1545           T remaining;
1546           T victimId = pr->u.p.parm4;
1547           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1548           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1549               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1550           KMP_DEBUG_ASSERT(v);
1551           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1552                  oldVictimId != victimId) {
1553             victimId = (victimId + 1) % nproc;
1554             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1555                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1556             KMP_DEBUG_ASSERT(v);
1557           }
1558           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1559             continue; // try once more (nproc attempts in total)
1560           }
1561           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1562             kmp_uint32 old = UNUSED;
1563             // try to steal whole range from inactive victim
1564             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1565             if (status) {
1566               // initialize self buffer with victim's whole range of chunks
1567               T id = victimId;
1568               T small_chunk = 0, extras = 0, p_extra = 0;
1569               __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1570                                               init, small_chunk, extras,
1571                                               p_extra);
1572               vnew.p.count = init + 1;
1573               vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1574               // write pair (count, ub) at once atomically
1575 #if KMP_ARCH_X86
1576               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1577 #else
1578               *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1579 #endif
1580               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1581               // no need to initialize other thread invariants: lb, st, etc.
1582 #ifdef KMP_DEBUG
1583               {
1584                 char *buff;
1585                 // create format specifiers before the debug output
1586                 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1587                                         "stolen chunks from T#%%d, "
1588                                         "count:%%%s ub:%%%s\n",
1589                                         traits_t<UT>::spec, traits_t<T>::spec);
1590                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1591                 __kmp_str_free(&buff);
1592               }
1593 #endif
1594               // activate non-empty buffer and let others steal from us
1595               if (pr->u.p.count < (UT)pr->u.p.ub)
1596                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1597               break;
1598             }
1599           }
1600           while (1) { // CAS loop with check if victim still has enough chunks
1601             // many threads may be stealing concurrently from same victim
1602             vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1603             if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1604                 vold.p.count >= (UT)vold.p.ub) {
1605               pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1606               break; // no chunks to steal, try next victim
1607             }
1608             vnew.b = vold.b;
1609             remaining = vold.p.ub - vold.p.count;
1610             // try to steal 1/4 of remaining
1611             // TODO: is this heuristic good enough?
1612             if (remaining > 7) {
1613               vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1614             } else {
1615               vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1616             }
1617             KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1618             if (KMP_COMPARE_AND_STORE_REL64(
1619                     (volatile kmp_int64 *)&v->u.p.count,
1620                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1621                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1622               // stealing succeeded
1623 #ifdef KMP_DEBUG
1624               {
1625                 char *buff;
1626                 // create format specifiers before the debug output
1627                 buff = __kmp_str_format(
1628                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1629                     "count:%%%s ub:%%%s\n",
1630                     traits_t<T>::spec, traits_t<T>::spec);
1631                 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1632                 __kmp_str_free(&buff);
1633               }
1634 #endif
1635               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1636                                         vold.p.ub - vnew.p.ub);
1637               status = 1;
1638               pr->u.p.parm4 = victimId; // keep victim id
1639               // now update own count and ub
1640               init = vnew.p.ub;
1641               vold.p.count = init + 1;
1642 #if KMP_ARCH_X86
1643               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1644 #else
1645               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1646 #endif
1647               // activate non-empty buffer and let others steal from us
1648               if (vold.p.count < (UT)vold.p.ub)
1649                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1650               break;
1651             } // if (check CAS result)
1652             KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1653           } // while (try to steal from particular victim)
1654         } // while (search for victim)
1655       } // if (try to find victim and steal)
1656     } // if (4-byte induction variable)
1657     if (!status) {
1658       *p_lb = 0;
1659       *p_ub = 0;
1660       if (p_st != NULL)
1661         *p_st = 0;
1662     } else {
1663       start = pr->u.p.lb;
1664       init *= chunk;
1665       limit = chunk + init - 1;
1666       incr = pr->u.p.st;
1667       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1668 
1669       KMP_DEBUG_ASSERT(init <= trip);
1670       // keep track of done chunks for possible early exit from stealing
1671       // TODO: count executed chunks locally with rare update of shared location
1672       // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1673       if ((last = (limit >= trip)) != 0)
1674         limit = trip;
1675       if (p_st != NULL)
1676         *p_st = incr;
1677 
1678       if (incr == 1) {
1679         *p_lb = start + init;
1680         *p_ub = start + limit;
1681       } else {
1682         *p_lb = start + init * incr;
1683         *p_ub = start + limit * incr;
1684       }
1685     } // if
1686     break;
1687   } // case
1688 #endif // KMP_STATIC_STEAL_ENABLED
1689   case kmp_sch_static_balanced: {
1690     KD_TRACE(
1691         10,
1692         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1693          gtid));
1694     /* check if thread has any iteration to do */
1695     if ((status = !pr->u.p.count) != 0) {
1696       pr->u.p.count = 1;
1697       *p_lb = pr->u.p.lb;
1698       *p_ub = pr->u.p.ub;
1699       last = (pr->u.p.parm1 != 0);
1700       if (p_st != NULL)
1701         *p_st = pr->u.p.st;
1702     } else { /* no iterations to do */
1703       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1704     }
1705   } // case
1706   break;
1707   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1708                                  merged here */
1709   case kmp_sch_static_chunked: {
1710     T parm1;
1711 
1712     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1713                    "kmp_sch_static_[affinity|chunked] case\n",
1714                    gtid));
1715     parm1 = pr->u.p.parm1;
1716 
1717     trip = pr->u.p.tc - 1;
1718     init = parm1 * (pr->u.p.count + tid);
1719 
1720     if ((status = (init <= trip)) != 0) {
1721       start = pr->u.p.lb;
1722       incr = pr->u.p.st;
1723       limit = parm1 + init - 1;
1724 
1725       if ((last = (limit >= trip)) != 0)
1726         limit = trip;
1727 
1728       if (p_st != NULL)
1729         *p_st = incr;
1730 
1731       pr->u.p.count += nproc;
1732 
1733       if (incr == 1) {
1734         *p_lb = start + init;
1735         *p_ub = start + limit;
1736       } else {
1737         *p_lb = start + init * incr;
1738         *p_ub = start + limit * incr;
1739       }
1740 
1741       if (pr->flags.ordered) {
1742         pr->u.p.ordered_lower = init;
1743         pr->u.p.ordered_upper = limit;
1744       } // if
1745     } // if
1746   } // case
1747   break;
1748 
1749   case kmp_sch_dynamic_chunked: {
1750     UT chunk_number;
1751     UT chunk_size = pr->u.p.parm1;
1752     UT nchunks = pr->u.p.parm2;
1753 
1754     KD_TRACE(
1755         100,
1756         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1757          gtid));
1758 
1759     chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1760     status = (chunk_number < nchunks);
1761     if (!status) {
1762       *p_lb = 0;
1763       *p_ub = 0;
1764       if (p_st != NULL)
1765         *p_st = 0;
1766     } else {
1767       init = chunk_size * chunk_number;
1768       trip = pr->u.p.tc - 1;
1769       start = pr->u.p.lb;
1770       incr = pr->u.p.st;
1771 
1772       if ((last = (trip - init < (UT)chunk_size)))
1773         limit = trip;
1774       else
1775         limit = chunk_size + init - 1;
1776 
1777       if (p_st != NULL)
1778         *p_st = incr;
1779 
1780       if (incr == 1) {
1781         *p_lb = start + init;
1782         *p_ub = start + limit;
1783       } else {
1784         *p_lb = start + init * incr;
1785         *p_ub = start + limit * incr;
1786       }
1787 
1788       if (pr->flags.ordered) {
1789         pr->u.p.ordered_lower = init;
1790         pr->u.p.ordered_upper = limit;
1791       } // if
1792     } // if
1793   } // case
1794   break;
1795 
1796   case kmp_sch_guided_iterative_chunked: {
1797     T chunkspec = pr->u.p.parm1;
1798     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1799                    "iterative case\n",
1800                    gtid));
1801     trip = pr->u.p.tc;
1802     // Start atomic part of calculations
1803     while (1) {
1804       ST remaining; // signed, because can be < 0
1805       init = sh->u.s.iteration; // shared value
1806       remaining = trip - init;
1807       if (remaining <= 0) { // AC: need to compare with 0 first
1808         // nothing to do, don't try atomic op
1809         status = 0;
1810         break;
1811       }
1812       if ((T)remaining <
1813           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1814         // use dynamic-style schedule
1815         // atomically increment iterations, get old value
1816         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1817                                  (ST)chunkspec);
1818         remaining = trip - init;
1819         if (remaining <= 0) {
1820           status = 0; // all iterations got by other threads
1821         } else {
1822           // got some iterations to work on
1823           status = 1;
1824           if ((T)remaining > chunkspec) {
1825             limit = init + chunkspec - 1;
1826           } else {
1827             last = true; // the last chunk
1828             limit = init + remaining - 1;
1829           } // if
1830         } // if
1831         break;
1832       } // if
1833       limit = init + (UT)((double)remaining *
1834                           *(double *)&pr->u.p.parm3); // divide by K*nproc
1835       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1836                                (ST)init, (ST)limit)) {
1837         // CAS was successful, chunk obtained
1838         status = 1;
1839         --limit;
1840         break;
1841       } // if
1842     } // while
1843     if (status != 0) {
1844       start = pr->u.p.lb;
1845       incr = pr->u.p.st;
1846       if (p_st != NULL)
1847         *p_st = incr;
1848       *p_lb = start + init * incr;
1849       *p_ub = start + limit * incr;
1850       if (pr->flags.ordered) {
1851         pr->u.p.ordered_lower = init;
1852         pr->u.p.ordered_upper = limit;
1853       } // if
1854     } else {
1855       *p_lb = 0;
1856       *p_ub = 0;
1857       if (p_st != NULL)
1858         *p_st = 0;
1859     } // if
1860   } // case
1861   break;
1862 
1863   case kmp_sch_guided_simd: {
1864     // same as iterative but curr-chunk adjusted to be multiple of given
1865     // chunk
1866     T chunk = pr->u.p.parm1;
1867     KD_TRACE(100,
1868              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1869               gtid));
1870     trip = pr->u.p.tc;
1871     // Start atomic part of calculations
1872     while (1) {
1873       ST remaining; // signed, because can be < 0
1874       init = sh->u.s.iteration; // shared value
1875       remaining = trip - init;
1876       if (remaining <= 0) { // AC: need to compare with 0 first
1877         status = 0; // nothing to do, don't try atomic op
1878         break;
1879       }
1880       KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1881       // compare with K*nproc*(chunk+1), K=2 by default
1882       if ((T)remaining < pr->u.p.parm2) {
1883         // use dynamic-style schedule
1884         // atomically increment iterations, get old value
1885         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1886                                  (ST)chunk);
1887         remaining = trip - init;
1888         if (remaining <= 0) {
1889           status = 0; // all iterations got by other threads
1890         } else {
1891           // got some iterations to work on
1892           status = 1;
1893           if ((T)remaining > chunk) {
1894             limit = init + chunk - 1;
1895           } else {
1896             last = true; // the last chunk
1897             limit = init + remaining - 1;
1898           } // if
1899         } // if
1900         break;
1901       } // if
1902       // divide by K*nproc
1903       UT span;
1904       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1905                          &span);
1906       UT rem = span % chunk;
1907       if (rem) // adjust so that span%chunk == 0
1908         span += chunk - rem;
1909       limit = init + span;
1910       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1911                                (ST)init, (ST)limit)) {
1912         // CAS was successful, chunk obtained
1913         status = 1;
1914         --limit;
1915         break;
1916       } // if
1917     } // while
1918     if (status != 0) {
1919       start = pr->u.p.lb;
1920       incr = pr->u.p.st;
1921       if (p_st != NULL)
1922         *p_st = incr;
1923       *p_lb = start + init * incr;
1924       *p_ub = start + limit * incr;
1925       if (pr->flags.ordered) {
1926         pr->u.p.ordered_lower = init;
1927         pr->u.p.ordered_upper = limit;
1928       } // if
1929     } else {
1930       *p_lb = 0;
1931       *p_ub = 0;
1932       if (p_st != NULL)
1933         *p_st = 0;
1934     } // if
1935   } // case
1936   break;
1937 
1938   case kmp_sch_guided_analytical_chunked: {
1939     T chunkspec = pr->u.p.parm1;
1940     UT chunkIdx;
1941 #if KMP_USE_X87CONTROL
1942     /* for storing original FPCW value for Windows* OS on
1943        IA-32 architecture 8-byte version */
1944     unsigned int oldFpcw;
1945     unsigned int fpcwSet = 0;
1946 #endif
1947     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1948                    "kmp_sch_guided_analytical_chunked case\n",
1949                    gtid));
1950 
1951     trip = pr->u.p.tc;
1952 
1953     KMP_DEBUG_ASSERT(nproc > 1);
1954     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1955 
1956     while (1) { /* this while loop is a safeguard against unexpected zero
1957                    chunk sizes */
1958       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1959       if (chunkIdx >= (UT)pr->u.p.parm2) {
1960         --trip;
1961         /* use dynamic-style scheduling */
1962         init = chunkIdx * chunkspec + pr->u.p.count;
1963         /* need to verify init > 0 in case of overflow in the above
1964          * calculation */
1965         if ((status = (init > 0 && init <= trip)) != 0) {
1966           limit = init + chunkspec - 1;
1967 
1968           if ((last = (limit >= trip)) != 0)
1969             limit = trip;
1970         }
1971         break;
1972       } else {
1973 /* use exponential-style scheduling */
1974 /* The following check is to work around the lack of long double precision on
1975    Windows* OS.
1976    This check works around the possible effect that init != 0 for chunkIdx == 0.
1977  */
1978 #if KMP_USE_X87CONTROL
1979         /* If we haven't already done so, save original
1980            FPCW and set precision to 64-bit, as Windows* OS
1981            on IA-32 architecture defaults to 53-bit */
1982         if (!fpcwSet) {
1983           oldFpcw = _control87(0, 0);
1984           _control87(_PC_64, _MCW_PC);
1985           fpcwSet = 0x30000;
1986         }
1987 #endif
1988         if (chunkIdx) {
1989           init = __kmp_dispatch_guided_remaining<T>(
1990               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1991           KMP_DEBUG_ASSERT(init);
1992           init = trip - init;
1993         } else
1994           init = 0;
1995         limit = trip - __kmp_dispatch_guided_remaining<T>(
1996                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1997         KMP_ASSERT(init <= limit);
1998         if (init < limit) {
1999           KMP_DEBUG_ASSERT(limit <= trip);
2000           --limit;
2001           status = 1;
2002           break;
2003         } // if
2004       } // if
2005     } // while (1)
2006 #if KMP_USE_X87CONTROL
2007     /* restore FPCW if necessary
2008        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2009     */
2010     if (fpcwSet && (oldFpcw & fpcwSet))
2011       _control87(oldFpcw, _MCW_PC);
2012 #endif
2013     if (status != 0) {
2014       start = pr->u.p.lb;
2015       incr = pr->u.p.st;
2016       if (p_st != NULL)
2017         *p_st = incr;
2018       *p_lb = start + init * incr;
2019       *p_ub = start + limit * incr;
2020       if (pr->flags.ordered) {
2021         pr->u.p.ordered_lower = init;
2022         pr->u.p.ordered_upper = limit;
2023       }
2024     } else {
2025       *p_lb = 0;
2026       *p_ub = 0;
2027       if (p_st != NULL)
2028         *p_st = 0;
2029     }
2030   } // case
2031   break;
2032 
2033   case kmp_sch_trapezoidal: {
2034     UT index;
2035     T parm2 = pr->u.p.parm2;
2036     T parm3 = pr->u.p.parm3;
2037     T parm4 = pr->u.p.parm4;
2038     KD_TRACE(100,
2039              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
2040               gtid));
2041 
2042     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2043 
2044     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2045     trip = pr->u.p.tc - 1;
2046 
2047     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2048       *p_lb = 0;
2049       *p_ub = 0;
2050       if (p_st != NULL)
2051         *p_st = 0;
2052     } else {
2053       start = pr->u.p.lb;
2054       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2055       incr = pr->u.p.st;
2056 
2057       if ((last = (limit >= trip)) != 0)
2058         limit = trip;
2059 
2060       if (p_st != NULL)
2061         *p_st = incr;
2062 
2063       if (incr == 1) {
2064         *p_lb = start + init;
2065         *p_ub = start + limit;
2066       } else {
2067         *p_lb = start + init * incr;
2068         *p_ub = start + limit * incr;
2069       }
2070 
2071       if (pr->flags.ordered) {
2072         pr->u.p.ordered_lower = init;
2073         pr->u.p.ordered_upper = limit;
2074       } // if
2075     } // if
2076   } // case
2077   break;
2078   default: {
2079     status = 0; // to avoid complaints on uninitialized variable use
2080     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2081                 KMP_HNT(GetNewerLibrary), // Hint
2082                 __kmp_msg_null // Variadic argument list terminator
2083     );
2084   } break;
2085   } // switch
2086   if (p_last)
2087     *p_last = last;
2088 #ifdef KMP_DEBUG
2089   if (pr->flags.ordered) {
2090     char *buff;
2091     // create format specifiers before the debug output
2092     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
2093                             "ordered_lower:%%%s ordered_upper:%%%s\n",
2094                             traits_t<UT>::spec, traits_t<UT>::spec);
2095     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
2096     __kmp_str_free(&buff);
2097   }
2098   {
2099     char *buff;
2100     // create format specifiers before the debug output
2101     buff = __kmp_str_format(
2102         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
2103         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
2104         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2105     KMP_DEBUG_ASSERT(p_last);
2106     KMP_DEBUG_ASSERT(p_st);
2107     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
2108     __kmp_str_free(&buff);
2109   }
2110 #endif
2111   return status;
2112 }
2113 
2114 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
2115    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
2116    is not called. */
2117 #if OMPT_SUPPORT && OMPT_OPTIONAL
// OMPT_LOOP_END takes no arguments; it picks up `status` and `codeptr` from
// the scope in which it is expanded. It expands to a bare
// `if (status == 0)` statement. Inside it, when
// ompt_enabled.ompt_callback_work is set, the registered ompt_callback_work
// callback is invoked with ompt_work_loop / ompt_scope_end, the addresses of
// the team's parallel_data and the task's task_data (both looked up with
// argument 0), a literal 0 (presumably the callback's count argument, cf. the
// TODO below -- confirm) and codeptr.
// NOTE(review): the expansion is not wrapped in do { } while (0), so it is
// not safe as the unbraced body of an if/else; use it as a standalone
// statement.
2118 #define OMPT_LOOP_END                                                          \
2119   if (status == 0) {                                                           \
2120     if (ompt_enabled.ompt_callback_work) {                                     \
2121       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
2122       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
2123       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
2124           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
2125           &(task_info->task_data), 0, codeptr);                                \
2126     }                                                                          \
2127   }
// OMPT_LOOP_DISPATCH(lb, ub, st, status): when
// ompt_enabled.ompt_callback_dispatch is set and `status` is nonzero, fills a
// local ompt_dispatch_chunk_t through OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub,
// st) (defined outside this section), stores its address in the ptr member
// of an ompt_data_t, and passes that to the registered
// ompt_callback_dispatch callback together with the team's parallel_data,
// the task's task_data and the tag ompt_dispatch_ws_loop_chunk.
// The chunk object is a local of the expansion's block, so the pointer
// handed to the callback refers to storage that goes out of scope when that
// block ends.
// NOTE(review): `status` is substituted without parentheses, and the
// expansion declares locals named team_info, task_info, chunk and instance;
// pass simple expressions that do not mention those names.
2128 #define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
2129   if (ompt_enabled.ompt_callback_dispatch && status) {                         \
2130     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);                \
2131     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);              \
2132     ompt_dispatch_chunk_t chunk;                                               \
2133     ompt_data_t instance = ompt_data_none;                                     \
2134     OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                                \
2135     instance.ptr = &chunk;                                                     \
2136     ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                      \
2137         &(team_info->parallel_data), &(task_info->task_data),                  \
2138         ompt_dispatch_ws_loop_chunk, instance);                                \
2139   }
2140 // TODO: implement count
2141 #else
// Without OMPT_SUPPORT && OMPT_OPTIONAL both macros expand to nothing, so
// their uses compile to empty statements.
2142 #define OMPT_LOOP_END // no-op
2143 #define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
2144 #endif
2145 
2146 #if KMP_STATS_ENABLED
// KMP_STATS_LOOP_END takes no arguments; it reads *p_lb, *p_ub, pr->u.p.st
// and status from the scope in which it is expanded. It expands to a
// brace-enclosed block that computes t, the number of iterations in the
// range [*p_lb, *p_ub] walked with stride pr->u.p.st, and hands it to
// KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t). With l = *p_lb,
// u = *p_ub and i = stride, all cast to kmp_int64:
//   status == 0 : t = 0, and KMP_POP_PARTITIONED_TIMER() is called
//   i == 1      : t = u - l + 1             (0 if u < l)
//   i <  0      : t = (l - u) / (-i) + 1    (0 if l < u)
//   otherwise   : t = (u - l) / i + 1       (0 if u < l)
// NOTE(review): a zero stride with nonzero status falls into the last branch
// and would divide by i == 0 when u >= l; presumably the stride is never
// zero at the expansion sites -- confirm.
2147 #define KMP_STATS_LOOP_END                                                     \
2148   {                                                                            \
2149     kmp_int64 u, l, t, i;                                                      \
2150     l = (kmp_int64)(*p_lb);                                                    \
2151     u = (kmp_int64)(*p_ub);                                                    \
2152     i = (kmp_int64)(pr->u.p.st);                                               \
2153     if (status == 0) {                                                         \
2154       t = 0;                                                                   \
2155       KMP_POP_PARTITIONED_TIMER();                                             \
2156     } else if (i == 1) {                                                       \
2157       if (u >= l)                                                              \
2158         t = u - l + 1;                                                         \
2159       else                                                                     \
2160         t = 0;                                                                 \
2161     } else if (i < 0) {                                                        \
2162       if (l >= u)                                                              \
2163         t = (l - u) / (-i) + 1;                                                \
2164       else                                                                     \
2165         t = 0;                                                                 \
2166     } else {                                                                   \
2167       if (u >= l)                                                              \
2168         t = (u - l) / i + 1;                                                   \
2169       else                                                                     \
2170         t = 0;                                                                 \
2171     }                                                                          \
2172     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
2173   }
2174 #else
// Stats disabled: the macro expands to nothing.
2175 #define KMP_STATS_LOOP_END /* Nothing */
2176 #endif
2177 
2178 template <typename T>
2179 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2180                                T *p_lb, T *p_ub,
2181                                typename traits_t<T>::signed_t *p_st
2182 #if OMPT_SUPPORT && OMPT_OPTIONAL
2183                                ,
2184                                void *codeptr
2185 #endif
2186 ) {
2187 
2188   typedef typename traits_t<T>::unsigned_t UT;
2189   typedef typename traits_t<T>::signed_t ST;
2190   // This is potentially slightly misleading, schedule(runtime) will appear here
2191   // even if the actual runtime schedule is static. (Which points out a
2192   // disadvantage of schedule(runtime): even when static scheduling is used it
2193   // costs more than a compile time choice to use static scheduling would.)
2194   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2195 
2196   int status;
2197   dispatch_private_info_template<T> *pr;
2198   __kmp_assert_valid_gtid(gtid);
2199   kmp_info_t *th = __kmp_threads[gtid];
2200   kmp_team_t *team = th->th.th_team;
2201 
2202   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2203   KD_TRACE(
2204       1000,
2205       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2206        gtid, p_lb, p_ub, p_st, p_last));
2207 
2208   if (team->t.t_serialized) {
2209     /* NOTE: serialize this dispatch because we are not at the active level */
2210     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2211         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2212     KMP_DEBUG_ASSERT(pr);
2213 
2214     if ((status = (pr->u.p.tc != 0)) == 0) {
2215       *p_lb = 0;
2216       *p_ub = 0;
2217       //            if ( p_last != NULL )
2218       //                *p_last = 0;
2219       if (p_st != NULL)
2220         *p_st = 0;
2221       if (__kmp_env_consistency_check) {
2222         if (pr->pushed_ws != ct_none) {
2223           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2224         }
2225       }
2226     } else if (pr->flags.nomerge) {
2227       kmp_int32 last;
2228       T start;
2229       UT limit, trip, init;
2230       ST incr;
2231       T chunk = pr->u.p.parm1;
2232 
2233       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2234                      gtid));
2235 
2236       init = chunk * pr->u.p.count++;
2237       trip = pr->u.p.tc - 1;
2238 
2239       if ((status = (init <= trip)) == 0) {
2240         *p_lb = 0;
2241         *p_ub = 0;
2242         //                if ( p_last != NULL )
2243         //                    *p_last = 0;
2244         if (p_st != NULL)
2245           *p_st = 0;
2246         if (__kmp_env_consistency_check) {
2247           if (pr->pushed_ws != ct_none) {
2248             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2249           }
2250         }
2251       } else {
2252         start = pr->u.p.lb;
2253         limit = chunk + init - 1;
2254         incr = pr->u.p.st;
2255 
2256         if ((last = (limit >= trip)) != 0) {
2257           limit = trip;
2258 #if KMP_OS_WINDOWS
2259           pr->u.p.last_upper = pr->u.p.ub;
2260 #endif /* KMP_OS_WINDOWS */
2261         }
2262         if (p_last != NULL)
2263           *p_last = last;
2264         if (p_st != NULL)
2265           *p_st = incr;
2266         if (incr == 1) {
2267           *p_lb = start + init;
2268           *p_ub = start + limit;
2269         } else {
2270           *p_lb = start + init * incr;
2271           *p_ub = start + limit * incr;
2272         }
2273 
2274         if (pr->flags.ordered) {
2275           pr->u.p.ordered_lower = init;
2276           pr->u.p.ordered_upper = limit;
2277 #ifdef KMP_DEBUG
2278           {
2279             char *buff;
2280             // create format specifiers before the debug output
2281             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2282                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2283                                     traits_t<UT>::spec, traits_t<UT>::spec);
2284             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2285                             pr->u.p.ordered_upper));
2286             __kmp_str_free(&buff);
2287           }
2288 #endif
2289         } // if
2290       } // if
2291     } else {
2292       pr->u.p.tc = 0;
2293       *p_lb = pr->u.p.lb;
2294       *p_ub = pr->u.p.ub;
2295 #if KMP_OS_WINDOWS
2296       pr->u.p.last_upper = *p_ub;
2297 #endif /* KMP_OS_WINDOWS */
2298       if (p_last != NULL)
2299         *p_last = TRUE;
2300       if (p_st != NULL)
2301         *p_st = pr->u.p.st;
2302     } // if
2303 #ifdef KMP_DEBUG
2304     {
2305       char *buff;
2306       // create format specifiers before the debug output
2307       buff = __kmp_str_format(
2308           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2309           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2310           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2311       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2312                     (p_last ? *p_last : 0), status));
2313       __kmp_str_free(&buff);
2314     }
2315 #endif
2316 #if INCLUDE_SSC_MARKS
2317     SSC_MARK_DISPATCH_NEXT();
2318 #endif
2319     OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2320     OMPT_LOOP_END;
2321     KMP_STATS_LOOP_END;
2322     return status;
2323   } else {
2324     kmp_int32 last = 0;
2325     dispatch_shared_info_template<T> volatile *sh;
2326 
2327     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2328                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2329 
2330     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2331         th->th.th_dispatch->th_dispatch_pr_current);
2332     KMP_DEBUG_ASSERT(pr);
2333     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2334         th->th.th_dispatch->th_dispatch_sh_current);
2335     KMP_DEBUG_ASSERT(sh);
2336 
2337 #if KMP_USE_HIER_SCHED
2338     if (pr->flags.use_hier)
2339       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2340     else
2341 #endif // KMP_USE_HIER_SCHED
2342       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2343                                                 p_st, th->th.th_team_nproc,
2344                                                 th->th.th_info.ds.ds_tid);
2345     // status == 0: no more iterations to execute
2346     if (status == 0) {
2347       ST num_done;
2348       num_done = test_then_inc<ST>(&sh->u.s.num_done);
2349 #ifdef KMP_DEBUG
2350       {
2351         char *buff;
2352         // create format specifiers before the debug output
2353         buff = __kmp_str_format(
2354             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2355             traits_t<ST>::spec);
2356         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2357         __kmp_str_free(&buff);
2358       }
2359 #endif
2360 
2361 #if KMP_USE_HIER_SCHED
2362       pr->flags.use_hier = FALSE;
2363 #endif
2364       if (num_done == th->th.th_team_nproc - 1) {
2365 #if KMP_STATIC_STEAL_ENABLED
2366         if (pr->schedule == kmp_sch_static_steal) {
2367           int i;
2368           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2369                     __kmp_dispatch_num_buffers; // current loop index
2370           // loop complete, safe to destroy locks used for stealing
2371           for (i = 0; i < th->th.th_team_nproc; ++i) {
2372             dispatch_private_info_template<T> *buf =
2373                 reinterpret_cast<dispatch_private_info_template<T> *>(
2374                     &team->t.t_dispatch[i].th_disp_buffer[idx]);
2375             KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2376             KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2377             if (traits_t<T>::type_size > 4) {
2378               // destroy locks used for stealing
2379               kmp_lock_t *lck = buf->u.p.steal_lock;
2380               KMP_ASSERT(lck != NULL);
2381               __kmp_destroy_lock(lck);
2382               __kmp_free(lck);
2383               buf->u.p.steal_lock = NULL;
2384             }
2385           }
2386         }
2387 #endif
2388         /* NOTE: release shared buffer to be reused */
2389 
2390         KMP_MB(); /* Flush all pending memory write invalidates.  */
2391 
2392         sh->u.s.num_done = 0;
2393         sh->u.s.iteration = 0;
2394 
2395         /* TODO replace with general release procedure? */
2396         if (pr->flags.ordered) {
2397           sh->u.s.ordered_iteration = 0;
2398         }
2399 
2400         sh->buffer_index += __kmp_dispatch_num_buffers;
2401         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2402                        gtid, sh->buffer_index));
2403 
2404         KMP_MB(); /* Flush all pending memory write invalidates.  */
2405 
2406       } // if
2407       if (__kmp_env_consistency_check) {
2408         if (pr->pushed_ws != ct_none) {
2409           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2410         }
2411       }
2412 
2413       th->th.th_dispatch->th_deo_fcn = NULL;
2414       th->th.th_dispatch->th_dxo_fcn = NULL;
2415       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2416       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2417     } // if (status == 0)
2418 #if KMP_OS_WINDOWS
2419     else if (last) {
2420       pr->u.p.last_upper = pr->u.p.ub;
2421     }
2422 #endif /* KMP_OS_WINDOWS */
2423     if (p_last != NULL && status != 0)
2424       *p_last = last;
2425   } // if
2426 
2427 #ifdef KMP_DEBUG
2428   {
2429     char *buff;
2430     // create format specifiers before the debug output
2431     buff = __kmp_str_format(
2432         "__kmp_dispatch_next: T#%%d normal case: "
2433         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2434         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2435     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2436                   (p_last ? *p_last : 0), status));
2437     __kmp_str_free(&buff);
2438   }
2439 #endif
2440 #if INCLUDE_SSC_MARKS
2441   SSC_MARK_DISPATCH_NEXT();
2442 #endif
2443   OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2444   OMPT_LOOP_END;
2445   KMP_STATS_LOOP_END;
2446   return status;
2447 }
2448 
2449 /*!
2450 @ingroup WORK_SHARING
2451 @param loc  source location information
2452 @param global_tid  global thread number
2453 @return Zero if the parallel region is not active and this thread should execute
2454 all sections, non-zero otherwise.
2455 
2456 Beginning of sections construct.
2457 There are no implicit barriers in the "sections" calls, rather the compiler
2458 should introduce an explicit barrier if it is required.
2459 
2460 This implementation is based on __kmp_dispatch_init, using same constructs for
2461 shared data (we can't have sections nested directly in omp for loop, there
2462 should be a parallel region in between)
2463 */
2464 kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
2465 
2466   int active;
2467   kmp_info_t *th;
2468   kmp_team_t *team;
2469   kmp_uint32 my_buffer_index;
2470   dispatch_shared_info_template<kmp_int32> volatile *sh;
2471 
2472   KMP_DEBUG_ASSERT(__kmp_init_serial);
2473 
2474   if (!TCR_4(__kmp_init_parallel))
2475     __kmp_parallel_initialize();
2476   __kmp_resume_if_soft_paused();
2477 
2478   /* setup data */
2479   th = __kmp_threads[gtid];
2480   team = th->th.th_team;
2481   active = !team->t.t_serialized;
2482   th->th.th_ident = loc;
2483 
2484   KMP_COUNT_BLOCK(OMP_SECTIONS);
2485   KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
2486 
2487   if (active) {
2488     // Setup sections in the same way as dynamic scheduled loops.
2489     // We need one shared data: which section is to execute next.
2490     // (in case parallel is not active, all sections will be executed on the
2491     // same thread)
2492     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2493                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2494 
2495     my_buffer_index = th->th.th_dispatch->th_disp_index++;
2496 
2497     // reuse shared data structures from dynamic sched loops:
2498     sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2499         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2500     KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2501                   my_buffer_index));
2502 
2503     th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2504     th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2505 
2506     KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2507                    "sh->buffer_index:%d\n",
2508                    gtid, my_buffer_index, sh->buffer_index));
2509     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
2510                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2511     // Note: KMP_WAIT() cannot be used there: buffer index and
2512     // my_buffer_index are *always* 32-bit integers.
2513     KMP_MB();
2514     KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2515                    "sh->buffer_index:%d\n",
2516                    gtid, my_buffer_index, sh->buffer_index));
2517 
2518     th->th.th_dispatch->th_dispatch_pr_current =
2519         nullptr; // sections construct doesn't need private data
2520     th->th.th_dispatch->th_dispatch_sh_current =
2521         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
2522   }
2523 
2524 #if OMPT_SUPPORT && OMPT_OPTIONAL
2525   if (ompt_enabled.ompt_callback_work) {
2526     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2527     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2528     ompt_callbacks.ompt_callback(ompt_callback_work)(
2529         ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2530         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2531   }
2532 #endif
2533   KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2534 
2535   return active;
2536 }
2537 
2538 /*!
2539 @ingroup WORK_SHARING
2540 @param loc  source location information
2541 @param global_tid  global thread number
2542 @param numberOfSections  number of sections in the 'sections' construct
2543 @return unsigned [from 0 to n) - number (id) of the section to execute next on
2544 this thread. n (or any other number not in range) - nothing to execute on this
2545 thread
2546 */
2547 
2548 kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2549                               kmp_int32 numberOfSections) {
2550 
2551   KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2552 
2553   kmp_info_t *th = __kmp_threads[gtid];
2554 #ifdef KMP_DEBUG
2555   kmp_team_t *team = th->th.th_team;
2556 #endif
2557 
2558   KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2559                   numberOfSections));
2560 
2561   // For serialized case we should not call this function:
2562   KMP_DEBUG_ASSERT(!team->t.t_serialized);
2563 
2564   dispatch_shared_info_template<kmp_int32> volatile *sh;
2565 
2566   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2567                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2568 
2569   KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2570   sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2571       th->th.th_dispatch->th_dispatch_sh_current);
2572   KMP_DEBUG_ASSERT(sh);
2573 
2574   kmp_int32 sectionIndex = 0;
2575   bool moreSectionsToExecute = true;
2576 
2577   // Find section to execute:
2578   sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2579   if (sectionIndex >= numberOfSections) {
2580     moreSectionsToExecute = false;
2581   }
2582 
2583   // status == 0: no more sections to execute;
2584   // OMPTODO: __kmpc_end_sections could be bypassed?
2585   if (!moreSectionsToExecute) {
2586     kmp_int32 num_done;
2587 
2588     num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2589 
2590     if (num_done == th->th.th_team_nproc - 1) {
2591       /* NOTE: release this buffer to be reused */
2592 
2593       KMP_MB(); /* Flush all pending memory write invalidates.  */
2594 
2595       sh->u.s.num_done = 0;
2596       sh->u.s.iteration = 0;
2597 
2598       KMP_MB(); /* Flush all pending memory write invalidates.  */
2599 
2600       sh->buffer_index += __kmp_dispatch_num_buffers;
2601       KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2602                      sh->buffer_index));
2603 
2604       KMP_MB(); /* Flush all pending memory write invalidates.  */
2605 
2606     } // if
2607 
2608     th->th.th_dispatch->th_deo_fcn = NULL;
2609     th->th.th_dispatch->th_dxo_fcn = NULL;
2610     th->th.th_dispatch->th_dispatch_sh_current = NULL;
2611     th->th.th_dispatch->th_dispatch_pr_current = NULL;
2612 
2613 #if OMPT_SUPPORT && OMPT_OPTIONAL
2614     if (ompt_enabled.ompt_callback_dispatch) {
2615       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2616       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2617       ompt_data_t instance = ompt_data_none;
2618       instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2619       ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2620           &(team_info->parallel_data), &(task_info->task_data),
2621           ompt_dispatch_section, instance);
2622     }
2623 #endif
2624   }
2625 
2626   return sectionIndex;
2627 }
2628 
2629 /*!
2630 @ingroup WORK_SHARING
2631 @param loc  source location information
2632 @param global_tid  global thread number
2633 
2634 End of "sections" construct.
2635 Don't need to wait here: barrier is added separately when needed.
2636 */
2637 void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2638 
2639   kmp_info_t *th = __kmp_threads[gtid];
2640   int active = !th->th.th_team->t.t_serialized;
2641 
2642   KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2643 
2644   if (!active) {
2645     // In active case call finalization is done in __kmpc_next_section
2646 #if OMPT_SUPPORT && OMPT_OPTIONAL
2647     if (ompt_enabled.ompt_callback_work) {
2648       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2649       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2650       ompt_callbacks.ompt_callback(ompt_callback_work)(
2651           ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2652           &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2653     }
2654 #endif
2655   }
2656 
2657   KMP_POP_PARTITIONED_TIMER();
2658   KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2659 }
2660 
2661 template <typename T>
2662 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2663                                   kmp_int32 *plastiter, T *plower, T *pupper,
2664                                   typename traits_t<T>::signed_t incr) {
2665   typedef typename traits_t<T>::unsigned_t UT;
2666   kmp_uint32 team_id;
2667   kmp_uint32 nteams;
2668   UT trip_count;
2669   kmp_team_t *team;
2670   kmp_info_t *th;
2671 
2672   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2673   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2674 #ifdef KMP_DEBUG
2675   typedef typename traits_t<T>::signed_t ST;
2676   {
2677     char *buff;
2678     // create format specifiers before the debug output
2679     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2680                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2681                             traits_t<T>::spec, traits_t<T>::spec,
2682                             traits_t<ST>::spec, traits_t<T>::spec);
2683     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2684     __kmp_str_free(&buff);
2685   }
2686 #endif
2687 
2688   if (__kmp_env_consistency_check) {
2689     if (incr == 0) {
2690       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2691                             loc);
2692     }
2693     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2694       // The loop is illegal.
2695       // Some zero-trip loops maintained by compiler, e.g.:
2696       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2697       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2698       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2699       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2700       // Compiler does not check the following illegal loops:
2701       //   for(i=0;i<10;i+=incr) // where incr<0
2702       //   for(i=10;i>0;i-=incr) // where incr<0
2703       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2704     }
2705   }
2706   __kmp_assert_valid_gtid(gtid);
2707   th = __kmp_threads[gtid];
2708   team = th->th.th_team;
2709   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2710   nteams = th->th.th_teams_size.nteams;
2711   team_id = team->t.t_master_tid;
2712   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2713 
2714   // compute global trip count
2715   if (incr == 1) {
2716     trip_count = *pupper - *plower + 1;
2717   } else if (incr == -1) {
2718     trip_count = *plower - *pupper + 1;
2719   } else if (incr > 0) {
2720     // upper-lower can exceed the limit of signed type
2721     trip_count = (UT)(*pupper - *plower) / incr + 1;
2722   } else {
2723     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2724   }
2725 
2726   if (trip_count <= nteams) {
2727     KMP_DEBUG_ASSERT(
2728         __kmp_static == kmp_sch_static_greedy ||
2729         __kmp_static ==
2730             kmp_sch_static_balanced); // Unknown static scheduling type.
2731     // only some teams get single iteration, others get nothing
2732     if (team_id < trip_count) {
2733       *pupper = *plower = *plower + team_id * incr;
2734     } else {
2735       *plower = *pupper + incr; // zero-trip loop
2736     }
2737     if (plastiter != NULL)
2738       *plastiter = (team_id == trip_count - 1);
2739   } else {
2740     if (__kmp_static == kmp_sch_static_balanced) {
2741       UT chunk = trip_count / nteams;
2742       UT extras = trip_count % nteams;
2743       *plower +=
2744           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2745       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2746       if (plastiter != NULL)
2747         *plastiter = (team_id == nteams - 1);
2748     } else {
2749       T chunk_inc_count =
2750           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2751       T upper = *pupper;
2752       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2753       // Unknown static scheduling type.
2754       *plower += team_id * chunk_inc_count;
2755       *pupper = *plower + chunk_inc_count - incr;
2756       // Check/correct bounds if needed
2757       if (incr > 0) {
2758         if (*pupper < *plower)
2759           *pupper = traits_t<T>::max_value;
2760         if (plastiter != NULL)
2761           *plastiter = *plower <= upper && *pupper > upper - incr;
2762         if (*pupper > upper)
2763           *pupper = upper; // tracker C73258
2764       } else {
2765         if (*pupper > *plower)
2766           *pupper = traits_t<T>::min_value;
2767         if (plastiter != NULL)
2768           *plastiter = *plower >= upper && *pupper < upper - incr;
2769         if (*pupper < upper)
2770           *pupper = upper; // tracker C73258
2771       }
2772     }
2773   }
2774 }
2775 
2776 //-----------------------------------------------------------------------------
2777 // Dispatch routines
2778 //    Transfer call to template< type T >
2779 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2780 //                         T lb, T ub, ST st, ST chunk )
2781 extern "C" {
2782 
2783 /*!
2784 @ingroup WORK_SHARING
2785 @{
2786 @param loc Source location
2787 @param gtid Global thread id
2788 @param schedule Schedule type
2789 @param lb  Lower bound
2790 @param ub  Upper bound
2791 @param st  Step (or increment if you prefer)
2792 @param chunk The chunk size to block with
2793 
2794 This function prepares the runtime to start a dynamically scheduled for loop,
2795 saving the loop arguments.
2796 These functions are all identical apart from the types of the arguments.
2797 */
2798 
2799 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2800                             enum sched_type schedule, kmp_int32 lb,
2801                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2802   KMP_DEBUG_ASSERT(__kmp_init_serial);
2803 #if OMPT_SUPPORT && OMPT_OPTIONAL
2804   OMPT_STORE_RETURN_ADDRESS(gtid);
2805 #endif
2806   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2807 }
2808 /*!
2809 See @ref __kmpc_dispatch_init_4
2810 */
2811 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2812                              enum sched_type schedule, kmp_uint32 lb,
2813                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2814   KMP_DEBUG_ASSERT(__kmp_init_serial);
2815 #if OMPT_SUPPORT && OMPT_OPTIONAL
2816   OMPT_STORE_RETURN_ADDRESS(gtid);
2817 #endif
2818   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2819 }
2820 
2821 /*!
2822 See @ref __kmpc_dispatch_init_4
2823 */
2824 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2825                             enum sched_type schedule, kmp_int64 lb,
2826                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2827   KMP_DEBUG_ASSERT(__kmp_init_serial);
2828 #if OMPT_SUPPORT && OMPT_OPTIONAL
2829   OMPT_STORE_RETURN_ADDRESS(gtid);
2830 #endif
2831   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2832 }
2833 
2834 /*!
2835 See @ref __kmpc_dispatch_init_4
2836 */
2837 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2838                              enum sched_type schedule, kmp_uint64 lb,
2839                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2840   KMP_DEBUG_ASSERT(__kmp_init_serial);
2841 #if OMPT_SUPPORT && OMPT_OPTIONAL
2842   OMPT_STORE_RETURN_ADDRESS(gtid);
2843 #endif
2844   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2845 }
2846 
2847 /*!
2848 See @ref __kmpc_dispatch_init_4
2849 
The difference from the __kmpc_dispatch_init set of functions is that these
functions are called for the composite distribute parallel for construct. Thus,
before the regular dispatching of iterations, we need to compute the per-team
iteration space.
2853 
2854 These functions are all identical apart from the types of the arguments.
2855 */
2856 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2857                                  enum sched_type schedule, kmp_int32 *p_last,
2858                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2859                                  kmp_int32 chunk) {
2860   KMP_DEBUG_ASSERT(__kmp_init_serial);
2861 #if OMPT_SUPPORT && OMPT_OPTIONAL
2862   OMPT_STORE_RETURN_ADDRESS(gtid);
2863 #endif
2864   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2865   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2866 }
2867 
2868 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2869                                   enum sched_type schedule, kmp_int32 *p_last,
2870                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2871                                   kmp_int32 chunk) {
2872   KMP_DEBUG_ASSERT(__kmp_init_serial);
2873 #if OMPT_SUPPORT && OMPT_OPTIONAL
2874   OMPT_STORE_RETURN_ADDRESS(gtid);
2875 #endif
2876   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2877   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2878 }
2879 
2880 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2881                                  enum sched_type schedule, kmp_int32 *p_last,
2882                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2883                                  kmp_int64 chunk) {
2884   KMP_DEBUG_ASSERT(__kmp_init_serial);
2885 #if OMPT_SUPPORT && OMPT_OPTIONAL
2886   OMPT_STORE_RETURN_ADDRESS(gtid);
2887 #endif
2888   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2889   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2890 }
2891 
2892 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2893                                   enum sched_type schedule, kmp_int32 *p_last,
2894                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2895                                   kmp_int64 chunk) {
2896   KMP_DEBUG_ASSERT(__kmp_init_serial);
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898   OMPT_STORE_RETURN_ADDRESS(gtid);
2899 #endif
2900   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2901   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2902 }
2903 
2904 /*!
2905 @param loc Source code location
2906 @param gtid Global thread id
2907 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2908 otherwise
2909 @param p_lb   Pointer to the lower bound for the next chunk of work
2910 @param p_ub   Pointer to the upper bound for the next chunk of work
2911 @param p_st   Pointer to the stride for the next chunk of work
2912 @return one if there is work to be done, zero otherwise
2913 
2914 Get the next dynamically allocated chunk of work for this thread.
2915 If there is no more work, then the lb,ub and stride need not be modified.
2916 */
2917 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2918                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2919 #if OMPT_SUPPORT && OMPT_OPTIONAL
2920   OMPT_STORE_RETURN_ADDRESS(gtid);
2921 #endif
2922   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2923 #if OMPT_SUPPORT && OMPT_OPTIONAL
2924                                         ,
2925                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2926 #endif
2927   );
2928 }
2929 
2930 /*!
2931 See @ref __kmpc_dispatch_next_4
2932 */
2933 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2934                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2935                             kmp_int32 *p_st) {
2936 #if OMPT_SUPPORT && OMPT_OPTIONAL
2937   OMPT_STORE_RETURN_ADDRESS(gtid);
2938 #endif
2939   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2940 #if OMPT_SUPPORT && OMPT_OPTIONAL
2941                                          ,
2942                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2943 #endif
2944   );
2945 }
2946 
2947 /*!
2948 See @ref __kmpc_dispatch_next_4
2949 */
2950 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2951                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2952 #if OMPT_SUPPORT && OMPT_OPTIONAL
2953   OMPT_STORE_RETURN_ADDRESS(gtid);
2954 #endif
2955   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2956 #if OMPT_SUPPORT && OMPT_OPTIONAL
2957                                         ,
2958                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2959 #endif
2960   );
2961 }
2962 
2963 /*!
2964 See @ref __kmpc_dispatch_next_4
2965 */
2966 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2967                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2968                             kmp_int64 *p_st) {
2969 #if OMPT_SUPPORT && OMPT_OPTIONAL
2970   OMPT_STORE_RETURN_ADDRESS(gtid);
2971 #endif
2972   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2973 #if OMPT_SUPPORT && OMPT_OPTIONAL
2974                                          ,
2975                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2976 #endif
2977   );
2978 }
2979 
2980 /*!
2981 @param loc Source code location
2982 @param gtid Global thread id
2983 
2984 Mark the end of a dynamic loop.
2985 */
2986 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2987   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2988 }
2989 
2990 /*!
2991 See @ref __kmpc_dispatch_fini_4
2992 */
2993 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2994   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2995 }
2996 
2997 /*!
2998 See @ref __kmpc_dispatch_fini_4
2999 */
3000 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
3001   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
3002 }
3003 
3004 /*!
3005 See @ref __kmpc_dispatch_fini_4
3006 */
3007 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
3008   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
3009 }
3010 /*! @} */
3011 
3012 //-----------------------------------------------------------------------------
3013 // Non-template routines from kmp_dispatch.cpp used in other sources
3014 
3015 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
3016   return value == checker;
3017 }
3018 
3019 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
3020   return value != checker;
3021 }
3022 
3023 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
3024   return value < checker;
3025 }
3026 
3027 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
3028   return value >= checker;
3029 }
3030 
3031 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
3032   return value <= checker;
3033 }
3034 
3035 kmp_uint32
3036 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
3037              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
3038              void *obj // Higher-level synchronization object, or NULL.
3039 ) {
3040   // note: we may not belong to a team at this point
3041   volatile kmp_uint32 *spin = spinner;
3042   kmp_uint32 check = checker;
3043   kmp_uint32 spins;
3044   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
3045   kmp_uint32 r;
3046   kmp_uint64 time;
3047 
3048   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
3049   KMP_INIT_YIELD(spins);
3050   KMP_INIT_BACKOFF(time);
3051   // main wait spin loop
3052   while (!f(r = TCR_4(*spin), check)) {
3053     KMP_FSYNC_SPIN_PREPARE(obj);
3054     /* GEH - remove this since it was accidentally introduced when kmp_wait was
3055        split. It causes problems with infinite recursion because of exit lock */
3056     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
3057         __kmp_abort_thread(); */
3058     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3059   }
3060   KMP_FSYNC_SPIN_ACQUIRED(obj);
3061   return r;
3062 }
3063 
3064 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
3065                       kmp_uint32 (*pred)(void *, kmp_uint32),
3066                       void *obj // Higher-level synchronization object, or NULL.
3067 ) {
3068   // note: we may not belong to a team at this point
3069   void *spin = spinner;
3070   kmp_uint32 check = checker;
3071   kmp_uint32 spins;
3072   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
3073   kmp_uint64 time;
3074 
3075   KMP_FSYNC_SPIN_INIT(obj, spin);
3076   KMP_INIT_YIELD(spins);
3077   KMP_INIT_BACKOFF(time);
3078   // main wait spin loop
3079   while (!f(spin, check)) {
3080     KMP_FSYNC_SPIN_PREPARE(obj);
3081     /* if we have waited a bit, or are noversubscribed, yield */
3082     /* pause is in the following code */
3083     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3084   }
3085   KMP_FSYNC_SPIN_ACQUIRED(obj);
3086 }
3087 
3088 } // extern "C"
3089 
3090 #ifdef KMP_GOMP_COMPAT
3091 
3092 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
3093                                enum sched_type schedule, kmp_int32 lb,
3094                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
3095                                int push_ws) {
3096   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
3097                                  push_ws);
3098 }
3099 
3100 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
3101                                 enum sched_type schedule, kmp_uint32 lb,
3102                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
3103                                 int push_ws) {
3104   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
3105                                   push_ws);
3106 }
3107 
3108 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
3109                                enum sched_type schedule, kmp_int64 lb,
3110                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
3111                                int push_ws) {
3112   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
3113                                  push_ws);
3114 }
3115 
3116 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
3117                                 enum sched_type schedule, kmp_uint64 lb,
3118                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
3119                                 int push_ws) {
3120   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
3121                                   push_ws);
3122 }
3123 
3124 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
3125   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3126 }
3127 
3128 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
3129   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3130 }
3131 
3132 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
3133   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3134 }
3135 
3136 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
3137   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3138 }
3139 
3140 #endif /* KMP_GOMP_COMPAT */
3141 
3142 /* ------------------------------------------------------------------------ */
3143