xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.cpp (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  *       it may change values between parallel regions.  __kmp_max_nth
17  *       is the largest value __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // Nonmonotonic as default for dynamic schedule when no modifier is specified
76   int monotonicity = SCHEDULE_NONMONOTONIC;
77 
78   // Let default be monotonic for executables
79   // compiled with OpenMP* 4.5 or less compilers
80   if (loc != NULL && loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier || __kmp_force_monotonic)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
92 
93 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return num rounded to two decimal places, with ties rounded away from zero.
//
// The value is scaled to hundredths, biased by half a unit in the direction
// of its sign, truncated to an integer and scaled back. The bias must follow
// the sign because the integer conversion truncates toward zero: adding +0.5
// to a negative value would round it the wrong way (e.g. -1.234 -> -1.22).
// This matches the sign handling of __kmp_get_round_val.
//
// num * 100 must fit in an int; larger magnitudes are not supported.
static inline float __kmp_round_2decimal_val(float num) {
  const float scaled = num * 100;
  const int hundredths =
      static_cast<int>(scaled < 0 ? scaled - 0.5 : scaled + 0.5);
  return (float)hundredths / 100;
}
// Round num to the nearest integer, ties away from zero.
// Half a unit is added in the direction of num's sign so that the
// truncating conversion to int lands on the nearest integer.
static inline int __kmp_get_round_val(float num) {
  const double half_toward_sign = (num < 0) ? -0.5 : 0.5;
  return static_cast<int>(num + half_toward_sign);
}
101 #endif
102 
103 template <typename T>
104 inline void
105 __kmp_initialize_self_buffer(kmp_team_t *team, T id,
106                              dispatch_private_info_template<T> *pr,
107                              typename traits_t<T>::unsigned_t nchunks, T nproc,
108                              typename traits_t<T>::unsigned_t &init,
109                              T &small_chunk, T &extras, T &p_extra) {
110 
111 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
112   if (pr->flags.use_hybrid) {
113     kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
114     kmp_hw_core_type_t type =
115         (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
116     T pchunks = pr->u.p.pchunks;
117     T echunks = nchunks - pchunks;
118     T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
119     T num_procs_with_ecore = nproc - num_procs_with_pcore;
120     T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
121     T big_chunk =
122         pchunks / num_procs_with_pcore; // chunks per thread with p-core
123     small_chunk =
124         echunks / num_procs_with_ecore; // chunks per thread with e-core
125 
126     extras =
127         (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
128 
129     p_extra = (big_chunk - small_chunk);
130 
131     if (type == KMP_HW_CORE_TYPE_CORE) {
132       if (id < first_thread_with_ecore) {
133         init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
134       } else {
135         init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
136                (id < extras ? id : extras);
137       }
138     } else {
139       if (id == first_thread_with_ecore) {
140         init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
141       } else {
142         init = id * small_chunk + first_thread_with_ecore * p_extra +
143                (id < extras ? id : extras);
144       }
145     }
146     p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
147     return;
148   }
149 #endif
150 
151   small_chunk = nchunks / nproc; // chunks per thread
152   extras = nchunks % nproc;
153   p_extra = 0;
154   init = id * small_chunk + (id < extras ? id : extras);
155 }
156 
157 #if KMP_STATIC_STEAL_ENABLED
// Values for steal_flag: the possible states of a private per-loop buffer.
//
// Possible state changes:
//   UNUSED  -> CLAIMED  owner only, sync
//   UNUSED  -> THIEF    thief only, sync
//   CLAIMED -> READY    owner only, async
//   READY   -> THIEF    owner only, async
//   THIEF   -> READY    owner only, async
//   THIEF   -> UNUSED   last thread finishing the loop, async
enum {
  UNUSED = 0,
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // available for stealing
  THIEF = 3 // finished by owner, or claimed by thief
};
171 #endif
172 
173 // Initialize a dispatch_private_info_template<T> buffer for a particular
174 // type of schedule,chunk.  The loop description is found in lb (lower bound),
175 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
176 // to the scheduling (often the number of threads in a team, but not always if
177 // hierarchical scheduling is used).  tid is the id of the thread calling
178 // the function within the group of nproc threads.  It will have a value
179 // between 0 and nproc - 1.  This is often just the thread id within a team, but
180 // is not necessarily the case when using hierarchical scheduling.
181 // loc is the source file location of the corresponding loop
182 // gtid is the global thread id
183 template <typename T>
184 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
185                                    dispatch_private_info_template<T> *pr,
186                                    enum sched_type schedule, T lb, T ub,
187                                    typename traits_t<T>::signed_t st,
188 #if USE_ITT_BUILD
189                                    kmp_uint64 *cur_chunk,
190 #endif
191                                    typename traits_t<T>::signed_t chunk,
192                                    T nproc, T tid) {
193   typedef typename traits_t<T>::unsigned_t UT;
194   typedef typename traits_t<T>::floating_t DBL;
195 
196   int active;
197   T tc;
198   kmp_info_t *th;
199   kmp_team_t *team;
200   int monotonicity;
201   bool use_hier;
202 
203 #ifdef KMP_DEBUG
204   typedef typename traits_t<T>::signed_t ST;
205   {
206     char *buff;
207     // create format specifiers before the debug output
208     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
209                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
210                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
211                             traits_t<T>::spec, traits_t<T>::spec,
212                             traits_t<ST>::spec, traits_t<ST>::spec,
213                             traits_t<T>::spec, traits_t<T>::spec);
214     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
215     __kmp_str_free(&buff);
216   }
217 #endif
218   /* setup data */
219   th = __kmp_threads[gtid];
220   team = th->th.th_team;
221   active = !team->t.t_serialized;
222 
223 #if USE_ITT_BUILD
224   int itt_need_metadata_reporting =
225       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
226       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
227       team->t.t_active_level == 1;
228 #endif
229 
230 #if KMP_USE_HIER_SCHED
231   use_hier = pr->flags.use_hier;
232 #else
233   use_hier = false;
234 #endif
235 
236   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
237   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
238   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
239 
240   /* Pick up the nomerge/ordered bits from the scheduling type */
241   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
242     pr->flags.nomerge = TRUE;
243     schedule =
244         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
245   } else {
246     pr->flags.nomerge = FALSE;
247   }
248   pr->type_size = traits_t<T>::type_size; // remember the size of variables
249   if (kmp_ord_lower & schedule) {
250     pr->flags.ordered = TRUE;
251     schedule =
252         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
253   } else {
254     pr->flags.ordered = FALSE;
255   }
256   // Ordered overrides nonmonotonic
257   if (pr->flags.ordered) {
258     monotonicity = SCHEDULE_MONOTONIC;
259   }
260 
261   if (schedule == kmp_sch_static) {
262     schedule = __kmp_static;
263   } else {
264     if (schedule == kmp_sch_runtime) {
265       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
266       // not specified)
267       schedule = team->t.t_sched.r_sched_type;
268       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
269       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
270       if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
271         monotonicity = SCHEDULE_MONOTONIC;
272       // Detail the schedule if needed (global controls are differentiated
273       // appropriately)
274       if (schedule == kmp_sch_guided_chunked) {
275         schedule = __kmp_guided;
276       } else if (schedule == kmp_sch_static) {
277         schedule = __kmp_static;
278       }
279       // Use the chunk size specified by OMP_SCHEDULE (or default if not
280       // specified)
281       chunk = team->t.t_sched.chunk;
282 #if USE_ITT_BUILD
283       if (cur_chunk)
284         *cur_chunk = chunk;
285 #endif
286 #ifdef KMP_DEBUG
287       {
288         char *buff;
289         // create format specifiers before the debug output
290         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
291                                 "schedule:%%d chunk:%%%s\n",
292                                 traits_t<ST>::spec);
293         KD_TRACE(10, (buff, gtid, schedule, chunk));
294         __kmp_str_free(&buff);
295       }
296 #endif
297     } else {
298       if (schedule == kmp_sch_guided_chunked) {
299         schedule = __kmp_guided;
300       }
301       if (chunk <= 0) {
302         chunk = KMP_DEFAULT_CHUNK;
303       }
304     }
305 
306     if (schedule == kmp_sch_auto) {
307       // mapping and differentiation: in the __kmp_do_serial_initialize()
308       schedule = __kmp_auto;
309 #ifdef KMP_DEBUG
310       {
311         char *buff;
312         // create format specifiers before the debug output
313         buff = __kmp_str_format(
314             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
315             "schedule:%%d chunk:%%%s\n",
316             traits_t<ST>::spec);
317         KD_TRACE(10, (buff, gtid, schedule, chunk));
318         __kmp_str_free(&buff);
319       }
320 #endif
321     }
322 #if KMP_STATIC_STEAL_ENABLED
323     // map nonmonotonic:dynamic to static steal
324     if (schedule == kmp_sch_dynamic_chunked) {
325       if (monotonicity == SCHEDULE_NONMONOTONIC)
326         schedule = kmp_sch_static_steal;
327     }
328 #endif
329     /* guided analytical not safe for too many threads */
330     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
331       schedule = kmp_sch_guided_iterative_chunked;
332       KMP_WARNING(DispatchManyThreads);
333     }
334     if (schedule == kmp_sch_runtime_simd) {
335       // compiler provides simd_width in the chunk parameter
336       schedule = team->t.t_sched.r_sched_type;
337       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
338       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
339       // Detail the schedule if needed (global controls are differentiated
340       // appropriately)
341       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
342           schedule == __kmp_static) {
343         schedule = kmp_sch_static_balanced_chunked;
344       } else {
345         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
346           schedule = kmp_sch_guided_simd;
347         }
348         chunk = team->t.t_sched.chunk * chunk;
349       }
350 #if USE_ITT_BUILD
351       if (cur_chunk)
352         *cur_chunk = chunk;
353 #endif
354 #ifdef KMP_DEBUG
355       {
356         char *buff;
357         // create format specifiers before the debug output
358         buff = __kmp_str_format(
359             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
360             " chunk:%%%s\n",
361             traits_t<ST>::spec);
362         KD_TRACE(10, (buff, gtid, schedule, chunk));
363         __kmp_str_free(&buff);
364       }
365 #endif
366     }
367     pr->u.p.parm1 = chunk;
368   }
369   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
370               "unknown scheduling type");
371 
372   pr->u.p.count = 0;
373 
374   if (__kmp_env_consistency_check) {
375     if (st == 0) {
376       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
377                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
378     }
379   }
380   // compute trip count
381   if (st == 1) { // most common case
382     if (ub >= lb) {
383       tc = ub - lb + 1;
384     } else { // ub < lb
385       tc = 0; // zero-trip
386     }
387   } else if (st < 0) {
388     if (lb >= ub) {
389       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
390       // where the division needs to be unsigned regardless of the result type
391       tc = (UT)(lb - ub) / (-st) + 1;
392     } else { // lb < ub
393       tc = 0; // zero-trip
394     }
395   } else { // st > 0
396     if (ub >= lb) {
397       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
398       // where the division needs to be unsigned regardless of the result type
399       tc = (UT)(ub - lb) / st + 1;
400     } else { // ub < lb
401       tc = 0; // zero-trip
402     }
403   }
404 
405 #if KMP_STATS_ENABLED
406   if (KMP_MASTER_GTID(gtid)) {
407     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
408   }
409 #endif
410 
411   pr->u.p.lb = lb;
412   pr->u.p.ub = ub;
413   pr->u.p.st = st;
414   pr->u.p.tc = tc;
415 
416 #if KMP_OS_WINDOWS
417   pr->u.p.last_upper = ub + st;
418 #endif /* KMP_OS_WINDOWS */
419 
420   /* NOTE: only the active parallel region(s) has active ordered sections */
421 
422   if (active) {
423     if (pr->flags.ordered) {
424       pr->ordered_bumped = 0;
425       pr->u.p.ordered_lower = 1;
426       pr->u.p.ordered_upper = 0;
427     }
428   }
429 
430   switch (schedule) {
431 #if KMP_STATIC_STEAL_ENABLED
432   case kmp_sch_static_steal: {
433     T ntc, init = 0;
434 
435     KD_TRACE(100,
436              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
437               gtid));
438 
439     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
440     if (nproc > 1 && ntc >= nproc) {
441       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
442       T id = tid;
443       T small_chunk, extras, p_extra = 0;
444       kmp_uint32 old = UNUSED;
445       int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
446       if (traits_t<T>::type_size > 4) {
447         // AC: TODO: check if 16-byte CAS available and use it to
448         // improve performance (probably wait for explicit request
449         // before spending time on this).
450         // For now use dynamically allocated per-private-buffer lock,
451         // free memory in __kmp_dispatch_next when status==0.
452         pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
453         __kmp_init_lock(pr->u.p.steal_lock);
454       }
455 
456 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
457       // Iterations are divided in a 60/40 skewed distribution among CORE and
458       // ATOM processors for hybrid systems
459       bool use_hybrid = false;
460       kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
461       T first_thread_with_ecore = 0;
462       T num_procs_with_pcore = 0;
463       T num_procs_with_ecore = 0;
464       T p_ntc = 0, e_ntc = 0;
465       if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
466           __kmp_affinity.type != affinity_explicit) {
467         use_hybrid = true;
468         core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
469         if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
470             __kmp_first_osid_with_ecore > -1) {
471           for (int i = 0; i < team->t.t_nproc; ++i) {
472             kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
473                                           ->th.th_topology_attrs.core_type;
474             int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
475             if (id == __kmp_first_osid_with_ecore) {
476               first_thread_with_ecore =
477                   team->t.t_threads[i]->th.th_info.ds.ds_tid;
478             }
479             if (type == KMP_HW_CORE_TYPE_CORE) {
480               num_procs_with_pcore++;
481             } else if (type == KMP_HW_CORE_TYPE_ATOM) {
482               num_procs_with_ecore++;
483             } else {
484               use_hybrid = false;
485               break;
486             }
487           }
488         }
489         if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
490           float multiplier = 60.0 / 40.0;
491           float p_ratio = (float)num_procs_with_pcore / nproc;
492           float e_ratio = (float)num_procs_with_ecore / nproc;
493           float e_multiplier =
494               (float)1 /
495               (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
496           float p_multiplier = multiplier * e_multiplier;
497           p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
498           if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
499             e_ntc =
500                 (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
501           else
502             e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
503           KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
504 
505           // Use regular static steal if not enough chunks for skewed
506           // distribution
507           use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
508                                        e_ntc >= num_procs_with_ecore)
509                             ? true
510                             : false);
511         } else {
512           use_hybrid = false;
513         }
514       }
515       pr->flags.use_hybrid = use_hybrid;
516       pr->u.p.pchunks = p_ntc;
517       pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
518       pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
519 
520       if (use_hybrid) {
521         KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
522         T big_chunk = p_ntc / num_procs_with_pcore;
523         small_chunk = e_ntc / num_procs_with_ecore;
524 
525         extras =
526             (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
527 
528         p_extra = (big_chunk - small_chunk);
529 
530         if (core_type == KMP_HW_CORE_TYPE_CORE) {
531           if (id < first_thread_with_ecore) {
532             init =
533                 id * small_chunk + id * p_extra + (id < extras ? id : extras);
534           } else {
535             init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
536                    (id < extras ? id : extras);
537           }
538         } else {
539           if (id == first_thread_with_ecore) {
540             init =
541                 id * small_chunk + id * p_extra + (id < extras ? id : extras);
542           } else {
543             init = id * small_chunk + first_thread_with_ecore * p_extra +
544                    (id < extras ? id : extras);
545           }
546         }
547         p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
548       } else
549 #endif
550       {
551         small_chunk = ntc / nproc;
552         extras = ntc % nproc;
553         init = id * small_chunk + (id < extras ? id : extras);
554         p_extra = 0;
555       }
556       pr->u.p.count = init;
557       if (claimed) { // are we succeeded in claiming own buffer?
558         pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
559         // Other threads will inspect steal_flag when searching for a victim.
560         // READY means other threads may steal from this thread from now on.
561         KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
562       } else {
563         // other thread has stolen whole our range
564         KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
565         pr->u.p.ub = init; // mark there is no iterations to work on
566       }
567       pr->u.p.parm2 = ntc; // save number of chunks
568       // parm3 is the number of times to attempt stealing which is
569       // nproc (just a heuristics, could be optimized later on).
570       pr->u.p.parm3 = nproc;
571       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
572       break;
573     } else {
574       /* too few chunks: switching to kmp_sch_dynamic_chunked */
575       schedule = kmp_sch_dynamic_chunked;
576       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
577                      "kmp_sch_dynamic_chunked\n",
578                      gtid));
579       goto dynamic_init;
580       break;
581     } // if
582   } // case
583 #endif
584   case kmp_sch_static_balanced: {
585     T init, limit;
586 
587     KD_TRACE(
588         100,
589         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
590          gtid));
591 
592     if (nproc > 1) {
593       T id = tid;
594 
595       if (tc < nproc) {
596         if (id < tc) {
597           init = id;
598           limit = id;
599           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
600         } else {
601           pr->u.p.count = 1; /* means no more chunks to execute */
602           pr->u.p.parm1 = FALSE;
603           break;
604         }
605       } else {
606         T small_chunk = tc / nproc;
607         T extras = tc % nproc;
608         init = id * small_chunk + (id < extras ? id : extras);
609         limit = init + small_chunk - (id < extras ? 0 : 1);
610         pr->u.p.parm1 = (id == nproc - 1);
611       }
612     } else {
613       if (tc > 0) {
614         init = 0;
615         limit = tc - 1;
616         pr->u.p.parm1 = TRUE;
617       } else {
618         // zero trip count
619         pr->u.p.count = 1; /* means no more chunks to execute */
620         pr->u.p.parm1 = FALSE;
621         break;
622       }
623     }
624 #if USE_ITT_BUILD
625     // Calculate chunk for metadata report
626     if (itt_need_metadata_reporting)
627       if (cur_chunk)
628         *cur_chunk = limit - init + 1;
629 #endif
630     if (st == 1) {
631       pr->u.p.lb = lb + init;
632       pr->u.p.ub = lb + limit;
633     } else {
634       // calculated upper bound, "ub" is user-defined upper bound
635       T ub_tmp = lb + limit * st;
636       pr->u.p.lb = lb + init * st;
637       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
638       // it exactly
639       if (st > 0) {
640         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
641       } else {
642         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
643       }
644     }
645     if (pr->flags.ordered) {
646       pr->u.p.ordered_lower = init;
647       pr->u.p.ordered_upper = limit;
648     }
649     break;
650   } // case
651   case kmp_sch_static_balanced_chunked: {
652     // similar to balanced, but chunk adjusted to multiple of simd width
653     T nth = nproc;
654     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
655                    " -> falling-through to static_greedy\n",
656                    gtid));
657     schedule = kmp_sch_static_greedy;
658     if (nth > 1)
659       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
660     else
661       pr->u.p.parm1 = tc;
662     break;
663   } // case
664   case kmp_sch_guided_simd:
665   case kmp_sch_guided_iterative_chunked: {
666     KD_TRACE(
667         100,
668         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
669          " case\n",
670          gtid));
671 
672     if (nproc > 1) {
673       if ((2L * chunk + 1) * nproc >= tc) {
674         /* chunk size too large, switch to dynamic */
675         schedule = kmp_sch_dynamic_chunked;
676         goto dynamic_init;
677       } else {
678         // when remaining iters become less than parm2 - switch to dynamic
679         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
680         *(double *)&pr->u.p.parm3 =
681             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
682       }
683     } else {
684       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
685                      "kmp_sch_static_greedy\n",
686                      gtid));
687       schedule = kmp_sch_static_greedy;
688       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
689       KD_TRACE(
690           100,
691           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
692            gtid));
693       pr->u.p.parm1 = tc;
694     } // if
695   } // case
696   break;
697   case kmp_sch_guided_analytical_chunked: {
698     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
699                    "kmp_sch_guided_analytical_chunked case\n",
700                    gtid));
701 
702     if (nproc > 1) {
703       if ((2L * chunk + 1) * nproc >= tc) {
704         /* chunk size too large, switch to dynamic */
705         schedule = kmp_sch_dynamic_chunked;
706         goto dynamic_init;
707       } else {
708         /* commonly used term: (2 nproc - 1)/(2 nproc) */
709         DBL x;
710 
711 #if KMP_USE_X87CONTROL
712         /* Linux* OS already has 64-bit computation by default for long double,
713            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
714            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
715            instead of the default 53-bit. Even though long double doesn't work
716            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
717            expected to impact the correctness of the algorithm, but this has not
718            been mathematically proven. */
719         // save original FPCW and set precision to 64-bit, as
720         // Windows* OS on IA-32 architecture defaults to 53-bit
721         unsigned int oldFpcw = _control87(0, 0);
722         _control87(_PC_64, _MCW_PC); // 0,0x30000
723 #endif
724         /* value used for comparison in solver for cross-over point */
725         KMP_ASSERT(tc > 0);
726         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
727 
728         /* crossover point--chunk indexes equal to or greater than
729            this point switch to dynamic-style scheduling */
730         UT cross;
731 
732         /* commonly used term: (2 nproc - 1)/(2 nproc) */
733         x = 1.0 - 0.5 / (double)nproc;
734 
735 #ifdef KMP_DEBUG
736         { // test natural alignment
737           struct _test_a {
738             char a;
739             union {
740               char b;
741               DBL d;
742             };
743           } t;
744           ptrdiff_t natural_alignment =
745               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
746           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
747           // long)natural_alignment );
748           KMP_DEBUG_ASSERT(
749               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
750         }
751 #endif // KMP_DEBUG
752 
753         /* save the term in thread private dispatch structure */
754         *(DBL *)&pr->u.p.parm3 = x;
755 
756         /* solve for the crossover point to the nearest integer i for which C_i
757            <= chunk */
758         {
759           UT left, right, mid;
760           long double p;
761 
762           /* estimate initial upper and lower bound */
763 
764           /* doesn't matter what value right is as long as it is positive, but
765              it affects performance of the solver */
766           right = 229;
767           p = __kmp_pow<UT>(x, right);
768           if (p > target) {
769             do {
770               p *= p;
771               right <<= 1;
772             } while (p > target && right < (1 << 27));
773             /* lower bound is previous (failed) estimate of upper bound */
774             left = right >> 1;
775           } else {
776             left = 0;
777           }
778 
779           /* bisection root-finding method */
780           while (left + 1 < right) {
781             mid = (left + right) / 2;
782             if (__kmp_pow<UT>(x, mid) > target) {
783               left = mid;
784             } else {
785               right = mid;
786             }
787           } // while
788           cross = right;
789         }
790         /* assert sanity of computed crossover point */
791         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
792                    __kmp_pow<UT>(x, cross) <= target);
793 
794         /* save the crossover point in thread private dispatch structure */
795         pr->u.p.parm2 = cross;
796 
797 // C75803
798 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
799 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
800 #else
801 #define GUIDED_ANALYTICAL_WORKAROUND (x)
802 #endif
803         /* dynamic-style scheduling offset */
804         pr->u.p.count = tc -
805                         __kmp_dispatch_guided_remaining(
806                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
807                         cross * chunk;
808 #if KMP_USE_X87CONTROL
809         // restore FPCW
810         _control87(oldFpcw, _MCW_PC);
811 #endif
812       } // if
813     } else {
814       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
815                      "kmp_sch_static_greedy\n",
816                      gtid));
817       schedule = kmp_sch_static_greedy;
818       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
819       pr->u.p.parm1 = tc;
820     } // if
821   } // case
822   break;
823   case kmp_sch_static_greedy:
824     KD_TRACE(
825         100,
826         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
827          gtid));
828     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
829     break;
830   case kmp_sch_static_chunked:
831   case kmp_sch_dynamic_chunked:
832   dynamic_init:
833     if (tc == 0)
834       break;
835     if (pr->u.p.parm1 <= 0)
836       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
837     else if (pr->u.p.parm1 > tc)
838       pr->u.p.parm1 = tc;
839     // Store the total number of chunks to prevent integer overflow during
840     // bounds calculations in the get next chunk routine.
841     pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
842     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
843                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
844                    gtid));
845     break;
846   case kmp_sch_trapezoidal: {
847     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
848 
849     T parm1, parm2, parm3, parm4;
850     KD_TRACE(100,
851              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
852               gtid));
853 
854     parm1 = chunk;
855 
856     /* F : size of the first cycle */
857     parm2 = (tc / (2 * nproc));
858 
859     if (parm2 < 1) {
860       parm2 = 1;
861     }
862 
863     /* L : size of the last cycle.  Make sure the last cycle is not larger
864        than the first cycle. */
865     if (parm1 < 1) {
866       parm1 = 1;
867     } else if (parm1 > parm2) {
868       parm1 = parm2;
869     }
870 
871     /* N : number of cycles */
872     parm3 = (parm2 + parm1);
873     parm3 = (2 * tc + parm3 - 1) / parm3;
874 
875     if (parm3 < 2) {
876       parm3 = 2;
877     }
878 
879     /* sigma : decreasing incr of the trapezoid */
880     parm4 = (parm3 - 1);
881     parm4 = (parm2 - parm1) / parm4;
882 
883     // pointless check, because parm4 >= 0 always
884     // if ( parm4 < 0 ) {
885     //    parm4 = 0;
886     //}
887 
888     pr->u.p.parm1 = parm1;
889     pr->u.p.parm2 = parm2;
890     pr->u.p.parm3 = parm3;
891     pr->u.p.parm4 = parm4;
892   } // case
893   break;
894 
895   default: {
896     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
897                 KMP_HNT(GetNewerLibrary), // Hint
898                 __kmp_msg_null // Variadic argument list terminator
899     );
900   } break;
901   } // switch
902   pr->schedule = schedule;
903 }
904 
905 #if KMP_USE_HIER_SCHED
906 template <typename T>
907 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
908                                              typename traits_t<T>::signed_t st);
909 template <>
910 inline void
911 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
912                                             kmp_int32 ub, kmp_int32 st) {
913   __kmp_dispatch_init_hierarchy<kmp_int32>(
914       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
915       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
916 }
917 template <>
918 inline void
919 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
920                                              kmp_uint32 ub, kmp_int32 st) {
921   __kmp_dispatch_init_hierarchy<kmp_uint32>(
922       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
923       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
924 }
925 template <>
926 inline void
927 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
928                                             kmp_int64 ub, kmp_int64 st) {
929   __kmp_dispatch_init_hierarchy<kmp_int64>(
930       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
931       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
932 }
933 template <>
934 inline void
935 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
936                                              kmp_uint64 ub, kmp_int64 st) {
937   __kmp_dispatch_init_hierarchy<kmp_uint64>(
938       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
939       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
940 }
941 
942 // free all the hierarchy scheduling memory associated with the team
943 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
944   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
945   for (int i = 0; i < num_disp_buff; ++i) {
946     // type does not matter here so use kmp_int32
947     auto sh =
948         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
949             &team->t.t_disp_buffer[i]);
950     if (sh->hier) {
951       sh->hier->deallocate();
952       __kmp_free(sh->hier);
953     }
954   }
955 }
956 #endif
957 
958 // UT - unsigned flavor of T, ST - signed flavor of T,
959 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
960 template <typename T>
961 static void
962 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
963                     T ub, typename traits_t<T>::signed_t st,
964                     typename traits_t<T>::signed_t chunk, int push_ws) {
965   typedef typename traits_t<T>::unsigned_t UT;
966 
967   int active;
968   kmp_info_t *th;
969   kmp_team_t *team;
970   kmp_uint32 my_buffer_index;
971   dispatch_private_info_template<T> *pr;
972   dispatch_shared_info_template<T> volatile *sh;
973 
974   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
975                    sizeof(dispatch_private_info));
976   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
977                    sizeof(dispatch_shared_info));
978   __kmp_assert_valid_gtid(gtid);
979 
980   if (!TCR_4(__kmp_init_parallel))
981     __kmp_parallel_initialize();
982 
983   __kmp_resume_if_soft_paused();
984 
985 #if INCLUDE_SSC_MARKS
986   SSC_MARK_DISPATCH_INIT();
987 #endif
988 #ifdef KMP_DEBUG
989   typedef typename traits_t<T>::signed_t ST;
990   {
991     char *buff;
992     // create format specifiers before the debug output
993     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
994                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
995                             traits_t<ST>::spec, traits_t<T>::spec,
996                             traits_t<T>::spec, traits_t<ST>::spec);
997     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
998     __kmp_str_free(&buff);
999   }
1000 #endif
1001   /* setup data */
1002   th = __kmp_threads[gtid];
1003   team = th->th.th_team;
1004   active = !team->t.t_serialized;
1005   th->th.th_ident = loc;
1006 
1007   // Any half-decent optimizer will remove this test when the blocks are empty
1008   // since the macros expand to nothing
1009   // when statistics are disabled.
1010   if (schedule == __kmp_static) {
1011     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
1012   } else {
1013     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
1014   }
1015 
1016 #if KMP_USE_HIER_SCHED
1017   // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
1018   // Hierarchical scheduling does not work with ordered, so if ordered is
1019   // detected, then revert back to threaded scheduling.
1020   bool ordered;
1021   enum sched_type my_sched = schedule;
1022   my_buffer_index = th->th.th_dispatch->th_disp_index;
1023   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1024       &th->th.th_dispatch
1025            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1026   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
1027   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
1028     my_sched =
1029         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
1030   ordered = (kmp_ord_lower & my_sched);
1031   if (pr->flags.use_hier) {
1032     if (ordered) {
1033       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
1034                      "Disabling hierarchical scheduling.\n",
1035                      gtid));
1036       pr->flags.use_hier = FALSE;
1037     }
1038   }
1039   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
1040     // Don't use hierarchical for ordered parallel loops and don't
1041     // use the runtime hierarchy if one was specified in the program
1042     if (!ordered && !pr->flags.use_hier)
1043       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
1044   }
1045 #endif // KMP_USE_HIER_SCHED
1046 
1047 #if USE_ITT_BUILD
1048   kmp_uint64 cur_chunk = chunk;
1049   int itt_need_metadata_reporting =
1050       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
1051       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
1052       team->t.t_active_level == 1;
1053 #endif
1054   if (!active) {
1055     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1056         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1057   } else {
1058     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1059                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1060 
1061     my_buffer_index = th->th.th_dispatch->th_disp_index++;
1062 
1063     /* What happens when number of threads changes, need to resize buffer? */
1064     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1065         &th->th.th_dispatch
1066              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1067     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
1068         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1069     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
1070                   my_buffer_index));
1071     if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
1072       KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
1073                      " sh->buffer_index:%d\n",
1074                      gtid, my_buffer_index, sh->buffer_index));
1075       __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1076                              __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1077       // Note: KMP_WAIT() cannot be used there: buffer index and
1078       // my_buffer_index are *always* 32-bit integers.
1079       KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1080                      "sh->buffer_index:%d\n",
1081                      gtid, my_buffer_index, sh->buffer_index));
1082     }
1083   }
1084 
1085   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
1086 #if USE_ITT_BUILD
1087                                 &cur_chunk,
1088 #endif
1089                                 chunk, (T)th->th.th_team_nproc,
1090                                 (T)th->th.th_info.ds.ds_tid);
1091   if (active) {
1092     if (pr->flags.ordered == 0) {
1093       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
1094       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
1095     } else {
1096       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
1097       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
1098     }
1099     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1100     th->th.th_dispatch->th_dispatch_sh_current =
1101         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1102 #if USE_ITT_BUILD
1103     if (pr->flags.ordered) {
1104       __kmp_itt_ordered_init(gtid);
1105     }
1106     // Report loop metadata
1107     if (itt_need_metadata_reporting) {
1108       // Only report metadata by primary thread of active team at level 1
1109       kmp_uint64 schedtype = 0;
1110       switch (schedule) {
1111       case kmp_sch_static_chunked:
1112       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1113         break;
1114       case kmp_sch_static_greedy:
1115         cur_chunk = pr->u.p.parm1;
1116         break;
1117       case kmp_sch_dynamic_chunked:
1118         schedtype = 1;
1119         break;
1120       case kmp_sch_guided_iterative_chunked:
1121       case kmp_sch_guided_analytical_chunked:
1122       case kmp_sch_guided_simd:
1123         schedtype = 2;
1124         break;
1125       default:
1126         // Should we put this case under "static"?
1127         // case kmp_sch_static_steal:
1128         schedtype = 3;
1129         break;
1130       }
1131       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
1132     }
1133 #if KMP_USE_HIER_SCHED
1134     if (pr->flags.use_hier) {
1135       pr->u.p.count = 0;
1136       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
1137     }
1138 #endif // KMP_USER_HIER_SCHED
1139 #endif /* USE_ITT_BUILD */
1140   }
1141 
1142 #ifdef KMP_DEBUG
1143   {
1144     char *buff;
1145     // create format specifiers before the debug output
1146     buff = __kmp_str_format(
1147         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1148         "lb:%%%s ub:%%%s"
1149         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1150         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1151         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1152         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1153         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1154         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1155     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
1156                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1157                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1158                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
1159     __kmp_str_free(&buff);
1160   }
1161 #endif
1162 #if OMPT_SUPPORT && OMPT_OPTIONAL
1163   if (ompt_enabled.ompt_callback_work) {
1164     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1165     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1166     ompt_callbacks.ompt_callback(ompt_callback_work)(
1167         ompt_get_work_schedule(pr->schedule), ompt_scope_begin,
1168         &(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,
1169         OMPT_LOAD_RETURN_ADDRESS(gtid));
1170   }
1171 #endif
1172   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1173 }
1174 
1175 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1176  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1177  * every chunk of iterations.  If the ordered section(s) were not executed
1178  * for this iteration (or every iteration in this chunk), we need to set the
1179  * ordered iteration counters so that the next thread can proceed. */
1180 template <typename UT>
1181 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1182   typedef typename traits_t<UT>::signed_t ST;
1183   __kmp_assert_valid_gtid(gtid);
1184   kmp_info_t *th = __kmp_threads[gtid];
1185 
1186   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1187   if (!th->th.th_team->t.t_serialized) {
1188 
1189     dispatch_private_info_template<UT> *pr =
1190         reinterpret_cast<dispatch_private_info_template<UT> *>(
1191             th->th.th_dispatch->th_dispatch_pr_current);
1192     dispatch_shared_info_template<UT> volatile *sh =
1193         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1194             th->th.th_dispatch->th_dispatch_sh_current);
1195     KMP_DEBUG_ASSERT(pr);
1196     KMP_DEBUG_ASSERT(sh);
1197     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1198                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1199 
1200     if (pr->ordered_bumped) {
1201       KD_TRACE(
1202           1000,
1203           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1204            gtid));
1205       pr->ordered_bumped = 0;
1206     } else {
1207       UT lower = pr->u.p.ordered_lower;
1208 
1209 #ifdef KMP_DEBUG
1210       {
1211         char *buff;
1212         // create format specifiers before the debug output
1213         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1214                                 "ordered_iteration:%%%s lower:%%%s\n",
1215                                 traits_t<UT>::spec, traits_t<UT>::spec);
1216         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1217         __kmp_str_free(&buff);
1218       }
1219 #endif
1220 
1221       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1222                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1223       KMP_MB(); /* is this necessary? */
1224 #ifdef KMP_DEBUG
1225       {
1226         char *buff;
1227         // create format specifiers before the debug output
1228         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1229                                 "ordered_iteration:%%%s lower:%%%s\n",
1230                                 traits_t<UT>::spec, traits_t<UT>::spec);
1231         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1232         __kmp_str_free(&buff);
1233       }
1234 #endif
1235 
1236       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1237     } // if
1238   } // if
1239   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1240 }
1241 
1242 #ifdef KMP_GOMP_COMPAT
1243 
1244 template <typename UT>
1245 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1246   typedef typename traits_t<UT>::signed_t ST;
1247   __kmp_assert_valid_gtid(gtid);
1248   kmp_info_t *th = __kmp_threads[gtid];
1249 
1250   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1251   if (!th->th.th_team->t.t_serialized) {
1252     dispatch_private_info_template<UT> *pr =
1253         reinterpret_cast<dispatch_private_info_template<UT> *>(
1254             th->th.th_dispatch->th_dispatch_pr_current);
1255     dispatch_shared_info_template<UT> volatile *sh =
1256         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1257             th->th.th_dispatch->th_dispatch_sh_current);
1258     KMP_DEBUG_ASSERT(pr);
1259     KMP_DEBUG_ASSERT(sh);
1260     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1261                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1262 
1263     UT lower = pr->u.p.ordered_lower;
1264     UT upper = pr->u.p.ordered_upper;
1265     UT inc = upper - lower + 1;
1266 
1267     if (pr->ordered_bumped == inc) {
1268       KD_TRACE(
1269           1000,
1270           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1271            gtid));
1272       pr->ordered_bumped = 0;
1273     } else {
1274       inc -= pr->ordered_bumped;
1275 
1276 #ifdef KMP_DEBUG
1277       {
1278         char *buff;
1279         // create format specifiers before the debug output
1280         buff = __kmp_str_format(
1281             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1282             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1283             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1284         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1285         __kmp_str_free(&buff);
1286       }
1287 #endif
1288 
1289       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1290                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1291 
1292       KMP_MB(); /* is this necessary? */
1293       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1294                       "ordered_bumped to zero\n",
1295                       gtid));
1296       pr->ordered_bumped = 0;
1297 //!!!!! TODO check if the inc should be unsigned, or signed???
1298 #ifdef KMP_DEBUG
1299       {
1300         char *buff;
1301         // create format specifiers before the debug output
1302         buff = __kmp_str_format(
1303             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1304             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1305             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1306             traits_t<UT>::spec);
1307         KD_TRACE(1000,
1308                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1309         __kmp_str_free(&buff);
1310       }
1311 #endif
1312 
1313       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1314     }
1315     //        }
1316   }
1317   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1318 }
1319 
1320 #endif /* KMP_GOMP_COMPAT */
1321 
1322 template <typename T>
1323 int __kmp_dispatch_next_algorithm(int gtid,
1324                                   dispatch_private_info_template<T> *pr,
1325                                   dispatch_shared_info_template<T> volatile *sh,
1326                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1327                                   typename traits_t<T>::signed_t *p_st, T nproc,
1328                                   T tid) {
1329   typedef typename traits_t<T>::unsigned_t UT;
1330   typedef typename traits_t<T>::signed_t ST;
1331   typedef typename traits_t<T>::floating_t DBL;
1332   int status = 0;
1333   bool last = false;
1334   T start;
1335   ST incr;
1336   UT limit, trip, init;
1337   kmp_info_t *th = __kmp_threads[gtid];
1338   kmp_team_t *team = th->th.th_team;
1339 
1340   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1341                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1342   KMP_DEBUG_ASSERT(pr);
1343   KMP_DEBUG_ASSERT(sh);
1344   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1345 #ifdef KMP_DEBUG
1346   {
1347     char *buff;
1348     // create format specifiers before the debug output
1349     buff =
1350         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1351                          "sh:%%p nproc:%%%s tid:%%%s\n",
1352                          traits_t<T>::spec, traits_t<T>::spec);
1353     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1354     __kmp_str_free(&buff);
1355   }
1356 #endif
1357 
1358   // zero trip count
1359   if (pr->u.p.tc == 0) {
1360     KD_TRACE(10,
1361              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1362               "zero status:%d\n",
1363               gtid, status));
1364     return 0;
1365   }
1366 
1367   switch (pr->schedule) {
1368 #if KMP_STATIC_STEAL_ENABLED
1369   case kmp_sch_static_steal: {
1370     T chunk = pr->u.p.parm1;
1371     UT nchunks = pr->u.p.parm2;
1372     KD_TRACE(100,
1373              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1374               gtid));
1375 
1376     trip = pr->u.p.tc - 1;
1377 
1378     if (traits_t<T>::type_size > 4) {
1379       // use lock for 8-byte induction variable.
1380       // TODO (optional): check presence and use 16-byte CAS
1381       kmp_lock_t *lck = pr->u.p.steal_lock;
1382       KMP_DEBUG_ASSERT(lck != NULL);
1383       if (pr->u.p.count < (UT)pr->u.p.ub) {
1384         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1385         __kmp_acquire_lock(lck, gtid);
1386         // try to get own chunk of iterations
1387         init = (pr->u.p.count)++;
1388         status = (init < (UT)pr->u.p.ub);
1389         __kmp_release_lock(lck, gtid);
1390       } else {
1391         status = 0; // no own chunks
1392       }
1393       if (!status) { // try to steal
1394         kmp_lock_t *lckv; // victim buffer's lock
1395         T while_limit = pr->u.p.parm3;
1396         T while_index = 0;
1397         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1398                   __kmp_dispatch_num_buffers; // current loop index
1399         // note: victim thread can potentially execute another loop
1400         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1401         while ((!status) && (while_limit != ++while_index)) {
1402           dispatch_private_info_template<T> *v;
1403           T remaining;
1404           T victimId = pr->u.p.parm4;
1405           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1406           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1407               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1408           KMP_DEBUG_ASSERT(v);
1409           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1410                  oldVictimId != victimId) {
1411             victimId = (victimId + 1) % nproc;
1412             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1413                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1414             KMP_DEBUG_ASSERT(v);
1415           }
1416           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1417             continue; // try once more (nproc attempts in total)
1418           }
1419           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1420             kmp_uint32 old = UNUSED;
1421             // try to steal whole range from inactive victim
1422             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1423             if (status) {
1424               // initialize self buffer with victim's whole range of chunks
1425               T id = victimId;
1426               T small_chunk = 0, extras = 0, p_extra = 0;
1427               __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1428                                               init, small_chunk, extras,
1429                                               p_extra);
1430               __kmp_acquire_lock(lck, gtid);
1431               pr->u.p.count = init + 1; // exclude one we execute immediately
1432               pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1433               __kmp_release_lock(lck, gtid);
1434               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1435               // no need to reinitialize other thread invariants: lb, st, etc.
1436 #ifdef KMP_DEBUG
1437               {
1438                 char *buff;
1439                 // create format specifiers before the debug output
1440                 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1441                                         "stolen chunks from T#%%d, "
1442                                         "count:%%%s ub:%%%s\n",
1443                                         traits_t<UT>::spec, traits_t<T>::spec);
1444                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1445                 __kmp_str_free(&buff);
1446               }
1447 #endif
1448               // activate non-empty buffer and let others steal from us
1449               if (pr->u.p.count < (UT)pr->u.p.ub)
1450                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1451               break;
1452             }
1453           }
1454           if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1455               v->u.p.count >= (UT)v->u.p.ub) {
1456             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1457             continue; // no chunks to steal, try next victim
1458           }
1459           lckv = v->u.p.steal_lock;
1460           KMP_ASSERT(lckv != NULL);
1461           __kmp_acquire_lock(lckv, gtid);
1462           limit = v->u.p.ub; // keep initial ub
1463           if (v->u.p.count >= limit) {
1464             __kmp_release_lock(lckv, gtid);
1465             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1466             continue; // no chunks to steal, try next victim
1467           }
1468 
1469           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1470           // TODO: is this heuristic good enough?
1471           remaining = limit - v->u.p.count;
1472           if (remaining > 7) {
1473             // steal 1/4 of remaining
1474             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1475             init = (v->u.p.ub -= (remaining >> 2));
1476           } else {
1477             // steal 1 chunk of 1..7 remaining
1478             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1479             init = (v->u.p.ub -= 1);
1480           }
1481           __kmp_release_lock(lckv, gtid);
1482 #ifdef KMP_DEBUG
1483           {
1484             char *buff;
1485             // create format specifiers before the debug output
1486             buff = __kmp_str_format(
1487                 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1488                 "count:%%%s ub:%%%s\n",
1489                 traits_t<UT>::spec, traits_t<UT>::spec);
1490             KD_TRACE(10, (buff, gtid, victimId, init, limit));
1491             __kmp_str_free(&buff);
1492           }
1493 #endif
1494           KMP_DEBUG_ASSERT(init + 1 <= limit);
1495           pr->u.p.parm4 = victimId; // remember victim to steal from
1496           status = 1;
1497           // now update own count and ub with stolen range excluding init chunk
1498           __kmp_acquire_lock(lck, gtid);
1499           pr->u.p.count = init + 1;
1500           pr->u.p.ub = limit;
1501           __kmp_release_lock(lck, gtid);
1502           // activate non-empty buffer and let others steal from us
1503           if (init + 1 < limit)
1504             KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1505         } // while (search for victim)
1506       } // if (try to find victim and steal)
1507     } else {
1508       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1509       // as all operations on pair (count, ub) must be done atomically
1510       typedef union {
1511         struct {
1512           UT count;
1513           T ub;
1514         } p;
1515         kmp_int64 b;
1516       } union_i4;
1517       union_i4 vold, vnew;
1518       if (pr->u.p.count < (UT)pr->u.p.ub) {
1519         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1520         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1521         vnew.b = vold.b;
1522         vnew.p.count++; // get chunk from head of self range
1523         while (!KMP_COMPARE_AND_STORE_REL64(
1524             (volatile kmp_int64 *)&pr->u.p.count,
1525             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1526             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1527           KMP_CPU_PAUSE();
1528           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1529           vnew.b = vold.b;
1530           vnew.p.count++;
1531         }
1532         init = vold.p.count;
1533         status = (init < (UT)vold.p.ub);
1534       } else {
1535         status = 0; // no own chunks
1536       }
1537       if (!status) { // try to steal
1538         T while_limit = pr->u.p.parm3;
1539         T while_index = 0;
1540         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1541                   __kmp_dispatch_num_buffers; // current loop index
1542         // note: victim thread can potentially execute another loop
1543         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1544         while ((!status) && (while_limit != ++while_index)) {
1545           dispatch_private_info_template<T> *v;
1546           T remaining;
1547           T victimId = pr->u.p.parm4;
1548           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1549           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1550               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1551           KMP_DEBUG_ASSERT(v);
1552           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1553                  oldVictimId != victimId) {
1554             victimId = (victimId + 1) % nproc;
1555             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1556                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1557             KMP_DEBUG_ASSERT(v);
1558           }
1559           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1560             continue; // try once more (nproc attempts in total)
1561           }
1562           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1563             kmp_uint32 old = UNUSED;
1564             // try to steal whole range from inactive victim
1565             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1566             if (status) {
1567               // initialize self buffer with victim's whole range of chunks
1568               T id = victimId;
1569               T small_chunk = 0, extras = 0, p_extra = 0;
1570               __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1571                                               init, small_chunk, extras,
1572                                               p_extra);
1573               vnew.p.count = init + 1;
1574               vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1575               // write pair (count, ub) at once atomically
1576 #if KMP_ARCH_X86
1577               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1578 #else
1579               *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1580 #endif
1581               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1582               // no need to initialize other thread invariants: lb, st, etc.
1583 #ifdef KMP_DEBUG
1584               {
1585                 char *buff;
1586                 // create format specifiers before the debug output
1587                 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1588                                         "stolen chunks from T#%%d, "
1589                                         "count:%%%s ub:%%%s\n",
1590                                         traits_t<UT>::spec, traits_t<T>::spec);
1591                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1592                 __kmp_str_free(&buff);
1593               }
1594 #endif
1595               // activate non-empty buffer and let others steal from us
1596               if (pr->u.p.count < (UT)pr->u.p.ub)
1597                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1598               break;
1599             }
1600           }
1601           while (1) { // CAS loop with check if victim still has enough chunks
1602             // many threads may be stealing concurrently from same victim
1603             vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1604             if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1605                 vold.p.count >= (UT)vold.p.ub) {
1606               pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1607               break; // no chunks to steal, try next victim
1608             }
1609             vnew.b = vold.b;
1610             remaining = vold.p.ub - vold.p.count;
1611             // try to steal 1/4 of remaining
1612             // TODO: is this heuristics good enough??
1613             if (remaining > 7) {
1614               vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1615             } else {
1616               vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1617             }
1618             KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1619             if (KMP_COMPARE_AND_STORE_REL64(
1620                     (volatile kmp_int64 *)&v->u.p.count,
1621                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1622                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1623               // stealing succeeded
1624 #ifdef KMP_DEBUG
1625               {
1626                 char *buff;
1627                 // create format specifiers before the debug output
1628                 buff = __kmp_str_format(
1629                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1630                     "count:%%%s ub:%%%s\n",
1631                     traits_t<T>::spec, traits_t<T>::spec);
1632                 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1633                 __kmp_str_free(&buff);
1634               }
1635 #endif
1636               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1637                                         vold.p.ub - vnew.p.ub);
1638               status = 1;
1639               pr->u.p.parm4 = victimId; // keep victim id
1640               // now update own count and ub
1641               init = vnew.p.ub;
1642               vold.p.count = init + 1;
1643 #if KMP_ARCH_X86
1644               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1645 #else
1646               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1647 #endif
1648               // activate non-empty buffer and let others steal from us
1649               if (vold.p.count < (UT)vold.p.ub)
1650                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1651               break;
1652             } // if (check CAS result)
1653             KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1654           } // while (try to steal from particular victim)
1655         } // while (search for victim)
1656       } // if (try to find victim and steal)
1657     } // if (4-byte induction variable)
1658     if (!status) {
1659       *p_lb = 0;
1660       *p_ub = 0;
1661       if (p_st != NULL)
1662         *p_st = 0;
1663     } else {
1664       start = pr->u.p.lb;
1665       init *= chunk;
1666       limit = chunk + init - 1;
1667       incr = pr->u.p.st;
1668       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1669 
1670       KMP_DEBUG_ASSERT(init <= trip);
1671       // keep track of done chunks for possible early exit from stealing
1672       // TODO: count executed chunks locally with rare update of shared location
1673       // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1674       if ((last = (limit >= trip)) != 0)
1675         limit = trip;
1676       if (p_st != NULL)
1677         *p_st = incr;
1678 
1679       if (incr == 1) {
1680         *p_lb = start + init;
1681         *p_ub = start + limit;
1682       } else {
1683         *p_lb = start + init * incr;
1684         *p_ub = start + limit * incr;
1685       }
1686     } // if
1687     break;
1688   } // case
1689 #endif // KMP_STATIC_STEAL_ENABLED
1690   case kmp_sch_static_balanced: {
1691     KD_TRACE(
1692         10,
1693         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1694          gtid));
1695     /* check if thread has any iteration to do */
1696     if ((status = !pr->u.p.count) != 0) {
1697       pr->u.p.count = 1;
1698       *p_lb = pr->u.p.lb;
1699       *p_ub = pr->u.p.ub;
1700       last = (pr->u.p.parm1 != 0);
1701       if (p_st != NULL)
1702         *p_st = pr->u.p.st;
1703     } else { /* no iterations to do */
1704       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1705     }
1706   } // case
1707   break;
1708   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1709                                  merged here */
1710   case kmp_sch_static_chunked: {
1711     T parm1;
1712 
1713     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1714                    "kmp_sch_static_[affinity|chunked] case\n",
1715                    gtid));
1716     parm1 = pr->u.p.parm1;
1717 
1718     trip = pr->u.p.tc - 1;
1719     init = parm1 * (pr->u.p.count + tid);
1720 
1721     if ((status = (init <= trip)) != 0) {
1722       start = pr->u.p.lb;
1723       incr = pr->u.p.st;
1724       limit = parm1 + init - 1;
1725 
1726       if ((last = (limit >= trip)) != 0)
1727         limit = trip;
1728 
1729       if (p_st != NULL)
1730         *p_st = incr;
1731 
1732       pr->u.p.count += nproc;
1733 
1734       if (incr == 1) {
1735         *p_lb = start + init;
1736         *p_ub = start + limit;
1737       } else {
1738         *p_lb = start + init * incr;
1739         *p_ub = start + limit * incr;
1740       }
1741 
1742       if (pr->flags.ordered) {
1743         pr->u.p.ordered_lower = init;
1744         pr->u.p.ordered_upper = limit;
1745       } // if
1746     } // if
1747   } // case
1748   break;
1749 
1750   case kmp_sch_dynamic_chunked: {
1751     UT chunk_number;
1752     UT chunk_size = pr->u.p.parm1;
1753     UT nchunks = pr->u.p.parm2;
1754 
1755     KD_TRACE(
1756         100,
1757         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1758          gtid));
1759 
1760     chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1761     status = (chunk_number < nchunks);
1762     if (!status) {
1763       *p_lb = 0;
1764       *p_ub = 0;
1765       if (p_st != NULL)
1766         *p_st = 0;
1767     } else {
1768       init = chunk_size * chunk_number;
1769       trip = pr->u.p.tc - 1;
1770       start = pr->u.p.lb;
1771       incr = pr->u.p.st;
1772 
1773       if ((last = (trip - init < (UT)chunk_size)))
1774         limit = trip;
1775       else
1776         limit = chunk_size + init - 1;
1777 
1778       if (p_st != NULL)
1779         *p_st = incr;
1780 
1781       if (incr == 1) {
1782         *p_lb = start + init;
1783         *p_ub = start + limit;
1784       } else {
1785         *p_lb = start + init * incr;
1786         *p_ub = start + limit * incr;
1787       }
1788 
1789       if (pr->flags.ordered) {
1790         pr->u.p.ordered_lower = init;
1791         pr->u.p.ordered_upper = limit;
1792       } // if
1793     } // if
1794   } // case
1795   break;
1796 
1797   case kmp_sch_guided_iterative_chunked: {
1798     T chunkspec = pr->u.p.parm1;
1799     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1800                    "iterative case\n",
1801                    gtid));
1802     trip = pr->u.p.tc;
1803     // Start atomic part of calculations
1804     while (1) {
1805       ST remaining; // signed, because can be < 0
1806       init = sh->u.s.iteration; // shared value
1807       remaining = trip - init;
1808       if (remaining <= 0) { // AC: need to compare with 0 first
1809         // nothing to do, don't try atomic op
1810         status = 0;
1811         break;
1812       }
1813       if ((T)remaining <
1814           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1815         // use dynamic-style schedule
1816         // atomically increment iterations, get old value
1817         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1818                                  (ST)chunkspec);
1819         remaining = trip - init;
1820         if (remaining <= 0) {
1821           status = 0; // all iterations got by other threads
1822         } else {
1823           // got some iterations to work on
1824           status = 1;
1825           if ((T)remaining > chunkspec) {
1826             limit = init + chunkspec - 1;
1827           } else {
1828             last = true; // the last chunk
1829             limit = init + remaining - 1;
1830           } // if
1831         } // if
1832         break;
1833       } // if
1834       limit = init + (UT)((double)remaining *
1835                           *(double *)&pr->u.p.parm3); // divide by K*nproc
1836       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1837                                (ST)init, (ST)limit)) {
1838         // CAS was successful, chunk obtained
1839         status = 1;
1840         --limit;
1841         break;
1842       } // if
1843     } // while
1844     if (status != 0) {
1845       start = pr->u.p.lb;
1846       incr = pr->u.p.st;
1847       if (p_st != NULL)
1848         *p_st = incr;
1849       *p_lb = start + init * incr;
1850       *p_ub = start + limit * incr;
1851       if (pr->flags.ordered) {
1852         pr->u.p.ordered_lower = init;
1853         pr->u.p.ordered_upper = limit;
1854       } // if
1855     } else {
1856       *p_lb = 0;
1857       *p_ub = 0;
1858       if (p_st != NULL)
1859         *p_st = 0;
1860     } // if
1861   } // case
1862   break;
1863 
1864   case kmp_sch_guided_simd: {
1865     // same as iterative but curr-chunk adjusted to be multiple of given
1866     // chunk
1867     T chunk = pr->u.p.parm1;
1868     KD_TRACE(100,
1869              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1870               gtid));
1871     trip = pr->u.p.tc;
1872     // Start atomic part of calculations
1873     while (1) {
1874       ST remaining; // signed, because can be < 0
1875       init = sh->u.s.iteration; // shared value
1876       remaining = trip - init;
1877       if (remaining <= 0) { // AC: need to compare with 0 first
1878         status = 0; // nothing to do, don't try atomic op
1879         break;
1880       }
1881       KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1882       // compare with K*nproc*(chunk+1), K=2 by default
1883       if ((T)remaining < pr->u.p.parm2) {
1884         // use dynamic-style schedule
1885         // atomically increment iterations, get old value
1886         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1887                                  (ST)chunk);
1888         remaining = trip - init;
1889         if (remaining <= 0) {
1890           status = 0; // all iterations got by other threads
1891         } else {
1892           // got some iterations to work on
1893           status = 1;
1894           if ((T)remaining > chunk) {
1895             limit = init + chunk - 1;
1896           } else {
1897             last = true; // the last chunk
1898             limit = init + remaining - 1;
1899           } // if
1900         } // if
1901         break;
1902       } // if
1903       // divide by K*nproc
1904       UT span;
1905       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1906                          &span);
1907       UT rem = span % chunk;
1908       if (rem) // adjust so that span%chunk == 0
1909         span += chunk - rem;
1910       limit = init + span;
1911       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1912                                (ST)init, (ST)limit)) {
1913         // CAS was successful, chunk obtained
1914         status = 1;
1915         --limit;
1916         break;
1917       } // if
1918     } // while
1919     if (status != 0) {
1920       start = pr->u.p.lb;
1921       incr = pr->u.p.st;
1922       if (p_st != NULL)
1923         *p_st = incr;
1924       *p_lb = start + init * incr;
1925       *p_ub = start + limit * incr;
1926       if (pr->flags.ordered) {
1927         pr->u.p.ordered_lower = init;
1928         pr->u.p.ordered_upper = limit;
1929       } // if
1930     } else {
1931       *p_lb = 0;
1932       *p_ub = 0;
1933       if (p_st != NULL)
1934         *p_st = 0;
1935     } // if
1936   } // case
1937   break;
1938 
1939   case kmp_sch_guided_analytical_chunked: {
1940     T chunkspec = pr->u.p.parm1;
1941     UT chunkIdx;
1942 #if KMP_USE_X87CONTROL
1943     /* for storing original FPCW value for Windows* OS on
1944        IA-32 architecture 8-byte version */
1945     unsigned int oldFpcw;
1946     unsigned int fpcwSet = 0;
1947 #endif
1948     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1949                    "kmp_sch_guided_analytical_chunked case\n",
1950                    gtid));
1951 
1952     trip = pr->u.p.tc;
1953 
1954     KMP_DEBUG_ASSERT(nproc > 1);
1955     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1956 
1957     while (1) { /* this while loop is a safeguard against unexpected zero
1958                    chunk sizes */
1959       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1960       if (chunkIdx >= (UT)pr->u.p.parm2) {
1961         --trip;
1962         /* use dynamic-style scheduling */
1963         init = chunkIdx * chunkspec + pr->u.p.count;
1964         /* need to verify init > 0 in case of overflow in the above
1965          * calculation */
1966         if ((status = (init > 0 && init <= trip)) != 0) {
1967           limit = init + chunkspec - 1;
1968 
1969           if ((last = (limit >= trip)) != 0)
1970             limit = trip;
1971         }
1972         break;
1973       } else {
1974 /* use exponential-style scheduling */
1975 /* The following check is to workaround the lack of long double precision on
1976    Windows* OS.
1977    This check works around the possible effect that init != 0 for chunkIdx == 0.
1978  */
1979 #if KMP_USE_X87CONTROL
1980         /* If we haven't already done so, save original
1981            FPCW and set precision to 64-bit, as Windows* OS
1982            on IA-32 architecture defaults to 53-bit */
1983         if (!fpcwSet) {
1984           oldFpcw = _control87(0, 0);
1985           _control87(_PC_64, _MCW_PC);
1986           fpcwSet = 0x30000;
1987         }
1988 #endif
1989         if (chunkIdx) {
1990           init = __kmp_dispatch_guided_remaining<T>(
1991               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1992           KMP_DEBUG_ASSERT(init);
1993           init = trip - init;
1994         } else
1995           init = 0;
1996         limit = trip - __kmp_dispatch_guided_remaining<T>(
1997                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1998         KMP_ASSERT(init <= limit);
1999         if (init < limit) {
2000           KMP_DEBUG_ASSERT(limit <= trip);
2001           --limit;
2002           status = 1;
2003           break;
2004         } // if
2005       } // if
2006     } // while (1)
2007 #if KMP_USE_X87CONTROL
2008     /* restore FPCW if necessary
2009        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2010     */
2011     if (fpcwSet && (oldFpcw & fpcwSet))
2012       _control87(oldFpcw, _MCW_PC);
2013 #endif
2014     if (status != 0) {
2015       start = pr->u.p.lb;
2016       incr = pr->u.p.st;
2017       if (p_st != NULL)
2018         *p_st = incr;
2019       *p_lb = start + init * incr;
2020       *p_ub = start + limit * incr;
2021       if (pr->flags.ordered) {
2022         pr->u.p.ordered_lower = init;
2023         pr->u.p.ordered_upper = limit;
2024       }
2025     } else {
2026       *p_lb = 0;
2027       *p_ub = 0;
2028       if (p_st != NULL)
2029         *p_st = 0;
2030     }
2031   } // case
2032   break;
2033 
2034   case kmp_sch_trapezoidal: {
2035     UT index;
2036     T parm2 = pr->u.p.parm2;
2037     T parm3 = pr->u.p.parm3;
2038     T parm4 = pr->u.p.parm4;
2039     KD_TRACE(100,
2040              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
2041               gtid));
2042 
2043     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2044 
2045     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2046     trip = pr->u.p.tc - 1;
2047 
2048     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2049       *p_lb = 0;
2050       *p_ub = 0;
2051       if (p_st != NULL)
2052         *p_st = 0;
2053     } else {
2054       start = pr->u.p.lb;
2055       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2056       incr = pr->u.p.st;
2057 
2058       if ((last = (limit >= trip)) != 0)
2059         limit = trip;
2060 
2061       if (p_st != NULL)
2062         *p_st = incr;
2063 
2064       if (incr == 1) {
2065         *p_lb = start + init;
2066         *p_ub = start + limit;
2067       } else {
2068         *p_lb = start + init * incr;
2069         *p_ub = start + limit * incr;
2070       }
2071 
2072       if (pr->flags.ordered) {
2073         pr->u.p.ordered_lower = init;
2074         pr->u.p.ordered_upper = limit;
2075       } // if
2076     } // if
2077   } // case
2078   break;
2079   default: {
2080     status = 0; // to avoid complaints on uninitialized variable use
2081     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2082                 KMP_HNT(GetNewerLibrary), // Hint
2083                 __kmp_msg_null // Variadic argument list terminator
2084     );
2085   } break;
2086   } // switch
2087   if (p_last)
2088     *p_last = last;
2089 #ifdef KMP_DEBUG
2090   if (pr->flags.ordered) {
2091     char *buff;
2092     // create format specifiers before the debug output
2093     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
2094                             "ordered_lower:%%%s ordered_upper:%%%s\n",
2095                             traits_t<UT>::spec, traits_t<UT>::spec);
2096     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
2097     __kmp_str_free(&buff);
2098   }
2099   {
2100     char *buff;
2101     // create format specifiers before the debug output
2102     buff = __kmp_str_format(
2103         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
2104         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
2105         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2106     KMP_DEBUG_ASSERT(p_last);
2107     KMP_DEBUG_ASSERT(p_st);
2108     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
2109     __kmp_str_free(&buff);
2110   }
2111 #endif
2112   return status;
2113 }
2114 
/* Macros used when exiting __kmp_dispatch_next(). Both expand in the caller's
   scope, are wrapped in do { } while (0) so that they behave as one statement
   (safe inside an unbraced if/else), and must be followed by ';'.

   OMPT_LOOP_END
     If status is 0 (no more work) and the ompt_callback_work callback is
     enabled, tell OMPT the loop is over (ompt_scope_end). This is done here
     because in some cases kmp_dispatch_fini() is not called.
     Reads the caller's locals `status`, `pr` and `codeptr`.

   OMPT_LOOP_DISPATCH(lb, ub, st, status)
     If status is non-zero and the ompt_callback_dispatch callback is enabled,
     build an ompt_dispatch_chunk_t from (lb, ub, st) with
     OMPT_GET_DISPATCH_CHUNK and pass a pointer to it to the callback as an
     ompt_dispatch_ws_loop_chunk instance.
     The expansion declares locals named team_info, task_info, chunk and
     instance, so the lb/ub/st arguments must not refer to caller variables
     with those names.

   When OMPT_SUPPORT && OMPT_OPTIONAL is false both macros expand to nothing. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  do {                                                                         \
    if (status == 0 && ompt_enabled.ompt_callback_work) {                      \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_get_work_schedule(pr->schedule), ompt_scope_end,                \
          &(team_info->parallel_data), &(task_info->task_data), 0, codeptr);   \
    }                                                                          \
  } while (0)
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
  do {                                                                         \
    if (ompt_enabled.ompt_callback_dispatch && (status)) {                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_dispatch_chunk_t chunk;                                             \
      ompt_data_t instance = ompt_data_none;                                   \
      OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                              \
      instance.ptr = &chunk;                                                   \
      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                    \
          &(team_info->parallel_data), &(task_info->task_data),                \
          ompt_dispatch_ws_loop_chunk, instance);                              \
    }                                                                          \
  } while (0)
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
#endif
2146 
/* KMP_STATS_LOOP_END: statistics bookkeeping used when returning from
   __kmp_dispatch_next(). Expands in the caller's scope and reads the caller's
   locals `status`, `p_lb`, `p_ub` and `pr`.

   - status == 0 (no chunk handed out): pops the partitioned timer and records
     0 iterations.
   - otherwise: records the number of iterations in the chunk bounded by *p_lb
     and *p_ub when stepped by pr->u.p.st; an empty range (bounds on the wrong
     side of each other for the sign of the stride) records 0.
   - a zero stride records 0 iterations instead of dividing by zero.

   The value is recorded with KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, ...).
   The body is wrapped in do { } while (0) so the macro acts as one statement
   and must be followed by ';'. When KMP_STATS_ENABLED is false it expands to
   nothing. */
#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  do {                                                                         \
    kmp_int64 stats_lb = (kmp_int64)(*p_lb);                                   \
    kmp_int64 stats_ub = (kmp_int64)(*p_ub);                                   \
    kmp_int64 stats_st = (kmp_int64)(pr->u.p.st);                              \
    kmp_int64 stats_trips = 0;                                                 \
    if (status == 0) {                                                         \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (stats_st > 0) {                                                 \
      if (stats_ub >= stats_lb)                                                \
        stats_trips = (stats_ub - stats_lb) / stats_st + 1;                    \
    } else if (stats_st < 0) {                                                 \
      if (stats_lb >= stats_ub)                                                \
        stats_trips = (stats_lb - stats_ub) / (-stats_st) + 1;                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, stats_trips);                 \
  } while (0)
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
2178 
2179 template <typename T>
2180 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2181                                T *p_lb, T *p_ub,
2182                                typename traits_t<T>::signed_t *p_st
2183 #if OMPT_SUPPORT && OMPT_OPTIONAL
2184                                ,
2185                                void *codeptr
2186 #endif
2187 ) {
2188 
2189   typedef typename traits_t<T>::unsigned_t UT;
2190   typedef typename traits_t<T>::signed_t ST;
2191   // This is potentially slightly misleading, schedule(runtime) will appear here
2192   // even if the actual runtime schedule is static. (Which points out a
2193   // disadvantage of schedule(runtime): even when static scheduling is used it
2194   // costs more than a compile time choice to use static scheduling would.)
2195   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2196 
2197   int status;
2198   dispatch_private_info_template<T> *pr;
2199   __kmp_assert_valid_gtid(gtid);
2200   kmp_info_t *th = __kmp_threads[gtid];
2201   kmp_team_t *team = th->th.th_team;
2202 
2203   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2204   KD_TRACE(
2205       1000,
2206       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2207        gtid, p_lb, p_ub, p_st, p_last));
2208 
2209   if (team->t.t_serialized) {
2210     /* NOTE: serialize this dispatch because we are not at the active level */
2211     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2212         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2213     KMP_DEBUG_ASSERT(pr);
2214 
2215     if ((status = (pr->u.p.tc != 0)) == 0) {
2216       *p_lb = 0;
2217       *p_ub = 0;
2218       //            if ( p_last != NULL )
2219       //                *p_last = 0;
2220       if (p_st != NULL)
2221         *p_st = 0;
2222       if (__kmp_env_consistency_check) {
2223         if (pr->pushed_ws != ct_none) {
2224           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2225         }
2226       }
2227     } else if (pr->flags.nomerge) {
2228       kmp_int32 last;
2229       T start;
2230       UT limit, trip, init;
2231       ST incr;
2232       T chunk = pr->u.p.parm1;
2233 
2234       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2235                      gtid));
2236 
2237       init = chunk * pr->u.p.count++;
2238       trip = pr->u.p.tc - 1;
2239 
2240       if ((status = (init <= trip)) == 0) {
2241         *p_lb = 0;
2242         *p_ub = 0;
2243         //                if ( p_last != NULL )
2244         //                    *p_last = 0;
2245         if (p_st != NULL)
2246           *p_st = 0;
2247         if (__kmp_env_consistency_check) {
2248           if (pr->pushed_ws != ct_none) {
2249             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2250           }
2251         }
2252       } else {
2253         start = pr->u.p.lb;
2254         limit = chunk + init - 1;
2255         incr = pr->u.p.st;
2256 
2257         if ((last = (limit >= trip)) != 0) {
2258           limit = trip;
2259 #if KMP_OS_WINDOWS
2260           pr->u.p.last_upper = pr->u.p.ub;
2261 #endif /* KMP_OS_WINDOWS */
2262         }
2263         if (p_last != NULL)
2264           *p_last = last;
2265         if (p_st != NULL)
2266           *p_st = incr;
2267         if (incr == 1) {
2268           *p_lb = start + init;
2269           *p_ub = start + limit;
2270         } else {
2271           *p_lb = start + init * incr;
2272           *p_ub = start + limit * incr;
2273         }
2274 
2275         if (pr->flags.ordered) {
2276           pr->u.p.ordered_lower = init;
2277           pr->u.p.ordered_upper = limit;
2278 #ifdef KMP_DEBUG
2279           {
2280             char *buff;
2281             // create format specifiers before the debug output
2282             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2283                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2284                                     traits_t<UT>::spec, traits_t<UT>::spec);
2285             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2286                             pr->u.p.ordered_upper));
2287             __kmp_str_free(&buff);
2288           }
2289 #endif
2290         } // if
2291       } // if
2292     } else {
2293       pr->u.p.tc = 0;
2294       *p_lb = pr->u.p.lb;
2295       *p_ub = pr->u.p.ub;
2296 #if KMP_OS_WINDOWS
2297       pr->u.p.last_upper = *p_ub;
2298 #endif /* KMP_OS_WINDOWS */
2299       if (p_last != NULL)
2300         *p_last = TRUE;
2301       if (p_st != NULL)
2302         *p_st = pr->u.p.st;
2303     } // if
2304 #ifdef KMP_DEBUG
2305     {
2306       char *buff;
2307       // create format specifiers before the debug output
2308       buff = __kmp_str_format(
2309           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2310           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2311           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2312       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2313                     (p_last ? *p_last : 0), status));
2314       __kmp_str_free(&buff);
2315     }
2316 #endif
2317 #if INCLUDE_SSC_MARKS
2318     SSC_MARK_DISPATCH_NEXT();
2319 #endif
2320     OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2321     OMPT_LOOP_END;
2322     KMP_STATS_LOOP_END;
2323     return status;
2324   } else {
2325     kmp_int32 last = 0;
2326     dispatch_shared_info_template<T> volatile *sh;
2327 
2328     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2329                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2330 
2331     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2332         th->th.th_dispatch->th_dispatch_pr_current);
2333     KMP_DEBUG_ASSERT(pr);
2334     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2335         th->th.th_dispatch->th_dispatch_sh_current);
2336     KMP_DEBUG_ASSERT(sh);
2337 
2338 #if KMP_USE_HIER_SCHED
2339     if (pr->flags.use_hier)
2340       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2341     else
2342 #endif // KMP_USE_HIER_SCHED
2343       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2344                                                 p_st, th->th.th_team_nproc,
2345                                                 th->th.th_info.ds.ds_tid);
2346     // status == 0: no more iterations to execute
2347     if (status == 0) {
2348       ST num_done;
2349       num_done = test_then_inc<ST>(&sh->u.s.num_done);
2350 #ifdef KMP_DEBUG
2351       {
2352         char *buff;
2353         // create format specifiers before the debug output
2354         buff = __kmp_str_format(
2355             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2356             traits_t<ST>::spec);
2357         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2358         __kmp_str_free(&buff);
2359       }
2360 #endif
2361 
2362 #if KMP_USE_HIER_SCHED
2363       pr->flags.use_hier = FALSE;
2364 #endif
2365       if (num_done == th->th.th_team_nproc - 1) {
2366 #if KMP_STATIC_STEAL_ENABLED
2367         if (pr->schedule == kmp_sch_static_steal) {
2368           int i;
2369           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2370                     __kmp_dispatch_num_buffers; // current loop index
2371           // loop complete, safe to destroy locks used for stealing
2372           for (i = 0; i < th->th.th_team_nproc; ++i) {
2373             dispatch_private_info_template<T> *buf =
2374                 reinterpret_cast<dispatch_private_info_template<T> *>(
2375                     &team->t.t_dispatch[i].th_disp_buffer[idx]);
2376             KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2377             KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2378             if (traits_t<T>::type_size > 4) {
2379               // destroy locks used for stealing
2380               kmp_lock_t *lck = buf->u.p.steal_lock;
2381               KMP_ASSERT(lck != NULL);
2382               __kmp_destroy_lock(lck);
2383               __kmp_free(lck);
2384               buf->u.p.steal_lock = NULL;
2385             }
2386           }
2387         }
2388 #endif
2389         /* NOTE: release shared buffer to be reused */
2390 
2391         KMP_MB(); /* Flush all pending memory write invalidates.  */
2392 
2393         sh->u.s.num_done = 0;
2394         sh->u.s.iteration = 0;
2395 
2396         /* TODO replace with general release procedure? */
2397         if (pr->flags.ordered) {
2398           sh->u.s.ordered_iteration = 0;
2399         }
2400 
2401         KMP_MB(); /* Flush all pending memory write invalidates.  */
2402 
2403         sh->buffer_index += __kmp_dispatch_num_buffers;
2404         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2405                        gtid, sh->buffer_index));
2406 
2407         KMP_MB(); /* Flush all pending memory write invalidates.  */
2408 
2409       } // if
2410       if (__kmp_env_consistency_check) {
2411         if (pr->pushed_ws != ct_none) {
2412           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2413         }
2414       }
2415 
2416       th->th.th_dispatch->th_deo_fcn = NULL;
2417       th->th.th_dispatch->th_dxo_fcn = NULL;
2418       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2419       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2420     } // if (status == 0)
2421 #if KMP_OS_WINDOWS
2422     else if (last) {
2423       pr->u.p.last_upper = pr->u.p.ub;
2424     }
2425 #endif /* KMP_OS_WINDOWS */
2426     if (p_last != NULL && status != 0)
2427       *p_last = last;
2428   } // if
2429 
2430 #ifdef KMP_DEBUG
2431   {
2432     char *buff;
2433     // create format specifiers before the debug output
2434     buff = __kmp_str_format(
2435         "__kmp_dispatch_next: T#%%d normal case: "
2436         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2437         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2438     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2439                   (p_last ? *p_last : 0), status));
2440     __kmp_str_free(&buff);
2441   }
2442 #endif
2443 #if INCLUDE_SSC_MARKS
2444   SSC_MARK_DISPATCH_NEXT();
2445 #endif
2446   OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2447   OMPT_LOOP_END;
2448   KMP_STATS_LOOP_END;
2449   return status;
2450 }
2451 
2452 /*!
2453 @ingroup WORK_SHARING
2454 @param loc  source location information
2455 @param global_tid  global thread number
2456 @return Zero if the parallel region is not active and this thread should execute
2457 all sections, non-zero otherwise.
2458 
2459 Beginning of sections construct.
2460 There are no implicit barriers in the "sections" calls, rather the compiler
2461 should introduce an explicit barrier if it is required.
2462 
2463 This implementation is based on __kmp_dispatch_init, using same constructs for
2464 shared data (we can't have sections nested directly in omp for loop, there
2465 should be a parallel region in between)
2466 */
2467 kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
2468 
2469   int active;
2470   kmp_info_t *th;
2471   kmp_team_t *team;
2472   kmp_uint32 my_buffer_index;
2473   dispatch_shared_info_template<kmp_int32> volatile *sh;
2474 
2475   KMP_DEBUG_ASSERT(__kmp_init_serial);
2476 
2477   if (!TCR_4(__kmp_init_parallel))
2478     __kmp_parallel_initialize();
2479   __kmp_resume_if_soft_paused();
2480 
2481   /* setup data */
2482   th = __kmp_threads[gtid];
2483   team = th->th.th_team;
2484   active = !team->t.t_serialized;
2485   th->th.th_ident = loc;
2486 
2487   KMP_COUNT_BLOCK(OMP_SECTIONS);
2488   KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
2489 
2490   if (active) {
2491     // Setup sections in the same way as dynamic scheduled loops.
2492     // We need one shared data: which section is to execute next.
2493     // (in case parallel is not active, all sections will be executed on the
2494     // same thread)
2495     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2496                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2497 
2498     my_buffer_index = th->th.th_dispatch->th_disp_index++;
2499 
2500     // reuse shared data structures from dynamic sched loops:
2501     sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2502         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2503     KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2504                   my_buffer_index));
2505 
2506     th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2507     th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2508 
2509     KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2510                    "sh->buffer_index:%d\n",
2511                    gtid, my_buffer_index, sh->buffer_index));
2512     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
2513                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2514     // Note: KMP_WAIT() cannot be used there: buffer index and
2515     // my_buffer_index are *always* 32-bit integers.
2516     KMP_MB();
2517     KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2518                    "sh->buffer_index:%d\n",
2519                    gtid, my_buffer_index, sh->buffer_index));
2520 
2521     th->th.th_dispatch->th_dispatch_pr_current =
2522         nullptr; // sections construct doesn't need private data
2523     th->th.th_dispatch->th_dispatch_sh_current =
2524         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
2525   }
2526 
2527 #if OMPT_SUPPORT && OMPT_OPTIONAL
2528   if (ompt_enabled.ompt_callback_work) {
2529     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2530     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2531     ompt_callbacks.ompt_callback(ompt_callback_work)(
2532         ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2533         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2534   }
2535 #endif
2536   KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2537 
2538   return active;
2539 }
2540 
2541 /*!
2542 @ingroup WORK_SHARING
2543 @param loc  source location information
2544 @param global_tid  global thread number
2545 @param numberOfSections  number of sections in the 'sections' construct
2546 @return unsigned [from 0 to n) - number (id) of the section to execute next on
2547 this thread. n (or any other number not in range) - nothing to execute on this
2548 thread
2549 */
2550 
2551 kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2552                               kmp_int32 numberOfSections) {
2553 
2554   KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2555 
2556   kmp_info_t *th = __kmp_threads[gtid];
2557 #ifdef KMP_DEBUG
2558   kmp_team_t *team = th->th.th_team;
2559 #endif
2560 
2561   KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2562                   numberOfSections));
2563 
2564   // For serialized case we should not call this function:
2565   KMP_DEBUG_ASSERT(!team->t.t_serialized);
2566 
2567   dispatch_shared_info_template<kmp_int32> volatile *sh;
2568 
2569   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2570                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2571 
2572   KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2573   sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2574       th->th.th_dispatch->th_dispatch_sh_current);
2575   KMP_DEBUG_ASSERT(sh);
2576 
2577   kmp_int32 sectionIndex = 0;
2578   bool moreSectionsToExecute = true;
2579 
2580   // Find section to execute:
2581   sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2582   if (sectionIndex >= numberOfSections) {
2583     moreSectionsToExecute = false;
2584   }
2585 
2586   // status == 0: no more sections to execute;
2587   // OMPTODO: __kmpc_end_sections could be bypassed?
2588   if (!moreSectionsToExecute) {
2589     kmp_int32 num_done;
2590 
2591     num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2592 
2593     if (num_done == th->th.th_team_nproc - 1) {
2594       /* NOTE: release this buffer to be reused */
2595 
2596       KMP_MB(); /* Flush all pending memory write invalidates.  */
2597 
2598       sh->u.s.num_done = 0;
2599       sh->u.s.iteration = 0;
2600 
2601       KMP_MB(); /* Flush all pending memory write invalidates.  */
2602 
2603       sh->buffer_index += __kmp_dispatch_num_buffers;
2604       KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2605                      sh->buffer_index));
2606 
2607       KMP_MB(); /* Flush all pending memory write invalidates.  */
2608 
2609     } // if
2610 
2611     th->th.th_dispatch->th_deo_fcn = NULL;
2612     th->th.th_dispatch->th_dxo_fcn = NULL;
2613     th->th.th_dispatch->th_dispatch_sh_current = NULL;
2614     th->th.th_dispatch->th_dispatch_pr_current = NULL;
2615 
2616 #if OMPT_SUPPORT && OMPT_OPTIONAL
2617     if (ompt_enabled.ompt_callback_dispatch) {
2618       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2619       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2620       ompt_data_t instance = ompt_data_none;
2621       instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2622       ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2623           &(team_info->parallel_data), &(task_info->task_data),
2624           ompt_dispatch_section, instance);
2625     }
2626 #endif
2627   }
2628 
2629   return sectionIndex;
2630 }
2631 
2632 /*!
2633 @ingroup WORK_SHARING
2634 @param loc  source location information
2635 @param global_tid  global thread number
2636 
2637 End of "sections" construct.
2638 Don't need to wait here: barrier is added separately when needed.
2639 */
2640 void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2641 
2642   kmp_info_t *th = __kmp_threads[gtid];
2643   int active = !th->th.th_team->t.t_serialized;
2644 
2645   KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2646 
2647   if (!active) {
2648     // In active case call finalization is done in __kmpc_next_section
2649 #if OMPT_SUPPORT && OMPT_OPTIONAL
2650     if (ompt_enabled.ompt_callback_work) {
2651       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2652       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2653       ompt_callbacks.ompt_callback(ompt_callback_work)(
2654           ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2655           &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2656     }
2657 #endif
2658   }
2659 
2660   KMP_POP_PARTITIONED_TIMER();
2661   KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2662 }
2663 
2664 template <typename T>
2665 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2666                                   kmp_int32 *plastiter, T *plower, T *pupper,
2667                                   typename traits_t<T>::signed_t incr) {
2668   typedef typename traits_t<T>::unsigned_t UT;
2669   kmp_uint32 team_id;
2670   kmp_uint32 nteams;
2671   UT trip_count;
2672   kmp_team_t *team;
2673   kmp_info_t *th;
2674 
2675   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2676   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2677 #ifdef KMP_DEBUG
2678   typedef typename traits_t<T>::signed_t ST;
2679   {
2680     char *buff;
2681     // create format specifiers before the debug output
2682     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2683                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2684                             traits_t<T>::spec, traits_t<T>::spec,
2685                             traits_t<ST>::spec, traits_t<T>::spec);
2686     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2687     __kmp_str_free(&buff);
2688   }
2689 #endif
2690 
2691   if (__kmp_env_consistency_check) {
2692     if (incr == 0) {
2693       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2694                             loc);
2695     }
2696     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2697       // The loop is illegal.
2698       // Some zero-trip loops maintained by compiler, e.g.:
2699       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2700       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2701       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2702       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2703       // Compiler does not check the following illegal loops:
2704       //   for(i=0;i<10;i+=incr) // where incr<0
2705       //   for(i=10;i>0;i-=incr) // where incr<0
2706       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2707     }
2708   }
2709   __kmp_assert_valid_gtid(gtid);
2710   th = __kmp_threads[gtid];
2711   team = th->th.th_team;
2712   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2713   nteams = th->th.th_teams_size.nteams;
2714   team_id = team->t.t_master_tid;
2715   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2716 
2717   // compute global trip count
2718   if (incr == 1) {
2719     trip_count = *pupper - *plower + 1;
2720   } else if (incr == -1) {
2721     trip_count = *plower - *pupper + 1;
2722   } else if (incr > 0) {
2723     // upper-lower can exceed the limit of signed type
2724     trip_count = (UT)(*pupper - *plower) / incr + 1;
2725   } else {
2726     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2727   }
2728 
2729   if (trip_count <= nteams) {
2730     KMP_DEBUG_ASSERT(
2731         __kmp_static == kmp_sch_static_greedy ||
2732         __kmp_static ==
2733             kmp_sch_static_balanced); // Unknown static scheduling type.
2734     // only some teams get single iteration, others get nothing
2735     if (team_id < trip_count) {
2736       *pupper = *plower = *plower + team_id * incr;
2737     } else {
2738       *plower = *pupper + incr; // zero-trip loop
2739     }
2740     if (plastiter != NULL)
2741       *plastiter = (team_id == trip_count - 1);
2742   } else {
2743     if (__kmp_static == kmp_sch_static_balanced) {
2744       UT chunk = trip_count / nteams;
2745       UT extras = trip_count % nteams;
2746       *plower +=
2747           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2748       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2749       if (plastiter != NULL)
2750         *plastiter = (team_id == nteams - 1);
2751     } else {
2752       T chunk_inc_count =
2753           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2754       T upper = *pupper;
2755       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2756       // Unknown static scheduling type.
2757       *plower += team_id * chunk_inc_count;
2758       *pupper = *plower + chunk_inc_count - incr;
2759       // Check/correct bounds if needed
2760       if (incr > 0) {
2761         if (*pupper < *plower)
2762           *pupper = traits_t<T>::max_value;
2763         if (plastiter != NULL)
2764           *plastiter = *plower <= upper && *pupper > upper - incr;
2765         if (*pupper > upper)
2766           *pupper = upper; // tracker C73258
2767       } else {
2768         if (*pupper > *plower)
2769           *pupper = traits_t<T>::min_value;
2770         if (plastiter != NULL)
2771           *plastiter = *plower >= upper && *pupper < upper - incr;
2772         if (*pupper < upper)
2773           *pupper = upper; // tracker C73258
2774       }
2775     }
2776   }
2777 }
2778 
2779 //-----------------------------------------------------------------------------
2780 // Dispatch routines
2781 //    Transfer call to template< type T >
2782 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2783 //                         T lb, T ub, ST st, ST chunk )
2784 extern "C" {
2785 
2786 /*!
2787 @ingroup WORK_SHARING
2788 @{
2789 @param loc Source location
2790 @param gtid Global thread id
2791 @param schedule Schedule type
2792 @param lb  Lower bound
2793 @param ub  Upper bound
2794 @param st  Step (or increment if you prefer)
2795 @param chunk The chunk size to block with
2796 
2797 This function prepares the runtime to start a dynamically scheduled for loop,
2798 saving the loop arguments.
2799 These functions are all identical apart from the types of the arguments.
2800 */
2801 
2802 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2803                             enum sched_type schedule, kmp_int32 lb,
2804                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2805   KMP_DEBUG_ASSERT(__kmp_init_serial);
2806 #if OMPT_SUPPORT && OMPT_OPTIONAL
2807   OMPT_STORE_RETURN_ADDRESS(gtid);
2808 #endif
2809   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2810 }
2811 /*!
2812 See @ref __kmpc_dispatch_init_4
2813 */
2814 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2815                              enum sched_type schedule, kmp_uint32 lb,
2816                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2817   KMP_DEBUG_ASSERT(__kmp_init_serial);
2818 #if OMPT_SUPPORT && OMPT_OPTIONAL
2819   OMPT_STORE_RETURN_ADDRESS(gtid);
2820 #endif
2821   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2822 }
2823 
2824 /*!
2825 See @ref __kmpc_dispatch_init_4
2826 */
2827 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2828                             enum sched_type schedule, kmp_int64 lb,
2829                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2830   KMP_DEBUG_ASSERT(__kmp_init_serial);
2831 #if OMPT_SUPPORT && OMPT_OPTIONAL
2832   OMPT_STORE_RETURN_ADDRESS(gtid);
2833 #endif
2834   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2835 }
2836 
2837 /*!
2838 See @ref __kmpc_dispatch_init_4
2839 */
2840 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2841                              enum sched_type schedule, kmp_uint64 lb,
2842                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2843   KMP_DEBUG_ASSERT(__kmp_init_serial);
2844 #if OMPT_SUPPORT && OMPT_OPTIONAL
2845   OMPT_STORE_RETURN_ADDRESS(gtid);
2846 #endif
2847   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2848 }
2849 
2850 /*!
2851 See @ref __kmpc_dispatch_init_4
2852 
2853 Difference from __kmpc_dispatch_init set of functions is these functions
2854 are called for composite distribute parallel for construct. Thus before
2855 regular iterations dispatching we need to calc per-team iteration space.
2856 
2857 These functions are all identical apart from the types of the arguments.
2858 */
2859 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2860                                  enum sched_type schedule, kmp_int32 *p_last,
2861                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2862                                  kmp_int32 chunk) {
2863   KMP_DEBUG_ASSERT(__kmp_init_serial);
2864 #if OMPT_SUPPORT && OMPT_OPTIONAL
2865   OMPT_STORE_RETURN_ADDRESS(gtid);
2866 #endif
2867   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2868   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2869 }
2870 
2871 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2872                                   enum sched_type schedule, kmp_int32 *p_last,
2873                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2874                                   kmp_int32 chunk) {
2875   KMP_DEBUG_ASSERT(__kmp_init_serial);
2876 #if OMPT_SUPPORT && OMPT_OPTIONAL
2877   OMPT_STORE_RETURN_ADDRESS(gtid);
2878 #endif
2879   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2880   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2881 }
2882 
2883 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2884                                  enum sched_type schedule, kmp_int32 *p_last,
2885                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2886                                  kmp_int64 chunk) {
2887   KMP_DEBUG_ASSERT(__kmp_init_serial);
2888 #if OMPT_SUPPORT && OMPT_OPTIONAL
2889   OMPT_STORE_RETURN_ADDRESS(gtid);
2890 #endif
2891   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2892   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2893 }
2894 
2895 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2896                                   enum sched_type schedule, kmp_int32 *p_last,
2897                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2898                                   kmp_int64 chunk) {
2899   KMP_DEBUG_ASSERT(__kmp_init_serial);
2900 #if OMPT_SUPPORT && OMPT_OPTIONAL
2901   OMPT_STORE_RETURN_ADDRESS(gtid);
2902 #endif
2903   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2904   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2905 }
2906 
2907 /*!
2908 @param loc Source code location
2909 @param gtid Global thread id
2910 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2911 otherwise
2912 @param p_lb   Pointer to the lower bound for the next chunk of work
2913 @param p_ub   Pointer to the upper bound for the next chunk of work
2914 @param p_st   Pointer to the stride for the next chunk of work
2915 @return one if there is work to be done, zero otherwise
2916 
2917 Get the next dynamically allocated chunk of work for this thread.
2918 If there is no more work, then the lb,ub and stride need not be modified.
2919 */
2920 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2921                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2922 #if OMPT_SUPPORT && OMPT_OPTIONAL
2923   OMPT_STORE_RETURN_ADDRESS(gtid);
2924 #endif
2925   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2926 #if OMPT_SUPPORT && OMPT_OPTIONAL
2927                                         ,
2928                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2929 #endif
2930   );
2931 }
2932 
2933 /*!
2934 See @ref __kmpc_dispatch_next_4
2935 */
2936 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2937                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2938                             kmp_int32 *p_st) {
2939 #if OMPT_SUPPORT && OMPT_OPTIONAL
2940   OMPT_STORE_RETURN_ADDRESS(gtid);
2941 #endif
2942   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2943 #if OMPT_SUPPORT && OMPT_OPTIONAL
2944                                          ,
2945                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2946 #endif
2947   );
2948 }
2949 
2950 /*!
2951 See @ref __kmpc_dispatch_next_4
2952 */
2953 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2954                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2955 #if OMPT_SUPPORT && OMPT_OPTIONAL
2956   OMPT_STORE_RETURN_ADDRESS(gtid);
2957 #endif
2958   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2959 #if OMPT_SUPPORT && OMPT_OPTIONAL
2960                                         ,
2961                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2962 #endif
2963   );
2964 }
2965 
2966 /*!
2967 See @ref __kmpc_dispatch_next_4
2968 */
2969 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2970                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2971                             kmp_int64 *p_st) {
2972 #if OMPT_SUPPORT && OMPT_OPTIONAL
2973   OMPT_STORE_RETURN_ADDRESS(gtid);
2974 #endif
2975   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2976 #if OMPT_SUPPORT && OMPT_OPTIONAL
2977                                          ,
2978                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2979 #endif
2980   );
2981 }
2982 
2983 /*!
2984 @param loc Source code location
2985 @param gtid Global thread id
2986 
2987 Mark the end of a dynamic loop.
2988 */
2989 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2990   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2991 }
2992 
2993 /*!
2994 See @ref __kmpc_dispatch_fini_4
2995 */
2996 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2997   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2998 }
2999 
3000 /*!
3001 See @ref __kmpc_dispatch_fini_4
3002 */
3003 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
3004   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
3005 }
3006 
3007 /*!
3008 See @ref __kmpc_dispatch_fini_4
3009 */
3010 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
3011   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
3012 }
3013 
3014 /*!
3015 See @ref __kmpc_dispatch_deinit
3016 */
3017 void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid) {}
3018 /*! @} */
3019 
3020 //-----------------------------------------------------------------------------
3021 // Non-template routines from kmp_dispatch.cpp used in other sources
3022 
3023 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
3024   return value == checker;
3025 }
3026 
3027 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
3028   return value != checker;
3029 }
3030 
3031 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
3032   return value < checker;
3033 }
3034 
3035 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
3036   return value >= checker;
3037 }
3038 
3039 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
3040   return value <= checker;
3041 }
3042 
3043 kmp_uint32
3044 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
3045              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
3046              void *obj // Higher-level synchronization object, or NULL.
3047 ) {
3048   // note: we may not belong to a team at this point
3049   volatile kmp_uint32 *spin = spinner;
3050   kmp_uint32 check = checker;
3051   kmp_uint32 spins;
3052   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
3053   kmp_uint32 r;
3054   kmp_uint64 time;
3055 
3056   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
3057   KMP_INIT_YIELD(spins);
3058   KMP_INIT_BACKOFF(time);
3059   // main wait spin loop
3060   while (!f(r = TCR_4(*spin), check)) {
3061     KMP_FSYNC_SPIN_PREPARE(obj);
3062     /* GEH - remove this since it was accidentally introduced when kmp_wait was
3063        split. It causes problems with infinite recursion because of exit lock */
3064     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
3065         __kmp_abort_thread(); */
3066     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3067   }
3068   KMP_FSYNC_SPIN_ACQUIRED(obj);
3069   return r;
3070 }
3071 
3072 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
3073                       kmp_uint32 (*pred)(void *, kmp_uint32),
3074                       void *obj // Higher-level synchronization object, or NULL.
3075 ) {
3076   // note: we may not belong to a team at this point
3077   void *spin = spinner;
3078   kmp_uint32 check = checker;
3079   kmp_uint32 spins;
3080   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
3081   kmp_uint64 time;
3082 
3083   KMP_FSYNC_SPIN_INIT(obj, spin);
3084   KMP_INIT_YIELD(spins);
3085   KMP_INIT_BACKOFF(time);
3086   // main wait spin loop
3087   while (!f(spin, check)) {
3088     KMP_FSYNC_SPIN_PREPARE(obj);
3089     /* if we have waited a bit, or are noversubscribed, yield */
3090     /* pause is in the following code */
3091     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3092   }
3093   KMP_FSYNC_SPIN_ACQUIRED(obj);
3094 }
3095 
3096 } // extern "C"
3097 
3098 #ifdef KMP_GOMP_COMPAT
3099 
3100 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
3101                                enum sched_type schedule, kmp_int32 lb,
3102                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
3103                                int push_ws) {
3104   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
3105                                  push_ws);
3106 }
3107 
3108 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
3109                                 enum sched_type schedule, kmp_uint32 lb,
3110                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
3111                                 int push_ws) {
3112   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
3113                                   push_ws);
3114 }
3115 
3116 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
3117                                enum sched_type schedule, kmp_int64 lb,
3118                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
3119                                int push_ws) {
3120   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
3121                                  push_ws);
3122 }
3123 
3124 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
3125                                 enum sched_type schedule, kmp_uint64 lb,
3126                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
3127                                 int push_ws) {
3128   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
3129                                   push_ws);
3130 }
3131 
3132 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
3133   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3134 }
3135 
3136 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
3137   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3138 }
3139 
3140 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
3141   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3142 }
3143 
3144 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
3145   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3146 }
3147 
3148 #endif /* KMP_GOMP_COMPAT */
3149 
3150 /* ------------------------------------------------------------------------ */
3151