xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.h (revision 5e801ac66d24704442eba426ed13c3effb8a34e7)
1 /*
2  * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_DISPATCH_H
14 #define KMP_DISPATCH_H
15 
16 /* ------------------------------------------------------------------------ */
17 /* ------------------------------------------------------------------------ */
18 
19 #include "kmp.h"
20 #include "kmp_error.h"
21 #include "kmp_i18n.h"
22 #include "kmp_itt.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #if KMP_OS_WINDOWS && KMP_ARCH_X86
26 #include <float.h>
27 #endif
28 
29 #if OMPT_SUPPORT
30 #include "ompt-internal.h"
31 #include "ompt-specific.h"
32 #endif
33 
34 /* ------------------------------------------------------------------------ */
35 /* ------------------------------------------------------------------------ */
36 #if KMP_USE_HIER_SCHED
37 // Forward declarations of some hierarchical scheduling data structures
38 template <typename T> struct kmp_hier_t;
39 template <typename T> struct kmp_hier_top_unit_t;
40 #endif // KMP_USE_HIER_SCHED
41 
42 template <typename T> struct dispatch_shared_info_template;
43 template <typename T> struct dispatch_private_info_template;
44 
45 template <typename T>
46 extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
47                                           dispatch_private_info_template<T> *pr,
48                                           enum sched_type schedule, T lb, T ub,
49                                           typename traits_t<T>::signed_t st,
50 #if USE_ITT_BUILD
51                                           kmp_uint64 *cur_chunk,
52 #endif
53                                           typename traits_t<T>::signed_t chunk,
54                                           T nproc, T unit_id);
55 template <typename T>
56 extern int __kmp_dispatch_next_algorithm(
57     int gtid, dispatch_private_info_template<T> *pr,
58     dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
59     T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
60 
61 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
62 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
63 
64 #if KMP_STATIC_STEAL_ENABLED
65 
66 // replaces dispatch_private_info{32,64} structures and
67 // dispatch_private_info{32,64}_t types
68 template <typename T> struct dispatch_private_infoXX_template {
69   typedef typename traits_t<T>::unsigned_t UT;
70   typedef typename traits_t<T>::signed_t ST;
71   UT count; // unsigned
72   T ub;
73   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
74   T lb;
75   ST st; // signed
76   UT tc; // unsigned
77   kmp_lock_t *steal_lock; // lock used for chunk stealing
78   /* parm[1-4] are used in different ways by different scheduling algorithms */
79 
80   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
81   //    a) parm3 is properly aligned and
82   //    b) all parm1-4 are in the same cache line.
83   // Because of parm1-4 are used together, performance seems to be better
84   // if they are in the same line (not measured though).
85 
86   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
87     T parm1;
88     T parm2;
89     T parm3;
90     T parm4;
91   };
92 
93   UT ordered_lower; // unsigned
94   UT ordered_upper; // unsigned
95 #if KMP_OS_WINDOWS
96   T last_upper;
97 #endif /* KMP_OS_WINDOWS */
98 };
99 
100 #else /* KMP_STATIC_STEAL_ENABLED */
101 
102 // replaces dispatch_private_info{32,64} structures and
103 // dispatch_private_info{32,64}_t types
104 template <typename T> struct dispatch_private_infoXX_template {
105   typedef typename traits_t<T>::unsigned_t UT;
106   typedef typename traits_t<T>::signed_t ST;
107   T lb;
108   T ub;
109   ST st; // signed
110   UT tc; // unsigned
111 
112   T parm1;
113   T parm2;
114   T parm3;
115   T parm4;
116 
117   UT count; // unsigned
118 
119   UT ordered_lower; // unsigned
120   UT ordered_upper; // unsigned
121 #if KMP_OS_WINDOWS
122   T last_upper;
123 #endif /* KMP_OS_WINDOWS */
124 };
125 #endif /* KMP_STATIC_STEAL_ENABLED */
126 
127 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
128   // duplicate alignment here, otherwise size of structure is not correct in our
129   // compiler
130   union KMP_ALIGN_CACHE private_info_tmpl {
131     dispatch_private_infoXX_template<T> p;
132     dispatch_private_info64_t p64;
133   } u;
134   enum sched_type schedule; /* scheduling algorithm */
135   kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
136   std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
137   kmp_uint32 ordered_bumped;
138   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
139   kmp_uint32 type_size;
140 #if KMP_USE_HIER_SCHED
141   kmp_int32 hier_id;
142   kmp_hier_top_unit_t<T> *hier_parent;
143   // member functions
144   kmp_int32 get_hier_id() const { return hier_id; }
145   kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
146 #endif
147   enum cons_type pushed_ws;
148 };
149 
150 // replaces dispatch_shared_info{32,64} structures and
151 // dispatch_shared_info{32,64}_t types
152 template <typename T> struct dispatch_shared_infoXX_template {
153   typedef typename traits_t<T>::unsigned_t UT;
154   typedef typename traits_t<T>::signed_t ST;
155   /* chunk index under dynamic, number of idle threads under static-steal;
156      iteration index otherwise */
157   volatile UT iteration;
158   volatile ST num_done;
159   volatile UT ordered_iteration;
160   // to retain the structure size making ordered_iteration scalar
161   UT ordered_dummy[KMP_MAX_ORDERED - 3];
162 };
163 
164 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
165 template <typename T> struct dispatch_shared_info_template {
166   typedef typename traits_t<T>::unsigned_t UT;
167   // we need union here to keep the structure size
168   union shared_info_tmpl {
169     dispatch_shared_infoXX_template<UT> s;
170     dispatch_shared_info64_t s64;
171   } u;
172   volatile kmp_uint32 buffer_index;
173   volatile kmp_int32 doacross_buf_idx; // teamwise index
174   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
175   kmp_int32 doacross_num_done; // count finished threads
176 #if KMP_USE_HIER_SCHED
177   kmp_hier_t<T> *hier;
178 #endif
179 #if KMP_USE_HWLOC
180   // When linking with libhwloc, the ORDERED EPCC test slowsdown on big
181   // machines (> 48 cores). Performance analysis showed that a cache thrash
182   // was occurring and this padding helps alleviate the problem.
183   char padding[64];
184 #endif
185 };
186 
187 /* ------------------------------------------------------------------------ */
188 /* ------------------------------------------------------------------------ */
189 
190 #undef USE_TEST_LOCKS
191 
192 // test_then_add template (general template should NOT be used)
193 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
194 
195 template <>
196 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
197                                                  kmp_int32 d) {
198   kmp_int32 r;
199   r = KMP_TEST_THEN_ADD32(p, d);
200   return r;
201 }
202 
203 template <>
204 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
205                                                  kmp_int64 d) {
206   kmp_int64 r;
207   r = KMP_TEST_THEN_ADD64(p, d);
208   return r;
209 }
210 
211 // test_then_inc_acq template (general template should NOT be used)
212 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
213 
214 template <>
215 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
216   kmp_int32 r;
217   r = KMP_TEST_THEN_INC_ACQ32(p);
218   return r;
219 }
220 
221 template <>
222 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
223   kmp_int64 r;
224   r = KMP_TEST_THEN_INC_ACQ64(p);
225   return r;
226 }
227 
228 // test_then_inc template (general template should NOT be used)
229 template <typename T> static __forceinline T test_then_inc(volatile T *p);
230 
231 template <>
232 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
233   kmp_int32 r;
234   r = KMP_TEST_THEN_INC32(p);
235   return r;
236 }
237 
238 template <>
239 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
240   kmp_int64 r;
241   r = KMP_TEST_THEN_INC64(p);
242   return r;
243 }
244 
245 // compare_and_swap template (general template should NOT be used)
246 template <typename T>
247 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
248 
249 template <>
250 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
251                                                     kmp_int32 c, kmp_int32 s) {
252   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
253 }
254 
255 template <>
256 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
257                                                     kmp_int64 c, kmp_int64 s) {
258   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
259 }
260 
261 template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
262   return value >= checker;
263 }
264 template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
265   return value == checker;
266 }
267 
268 /*
269     Spin wait loop that pauses between checks.
270     Waits until function returns non-zero when called with *spinner and check.
271     Does NOT put threads to sleep.
272     Arguments:
273         UT is unsigned 4- or 8-byte type
274         spinner - memory location to check value
275         checker - value which spinner is >, <, ==, etc.
276         pred - predicate function to perform binary comparison of some sort
277 #if USE_ITT_BUILD
278         obj -- is higher-level synchronization object to report to ittnotify. It
279         is used to report locks consistently. For example, if lock is acquired
280         immediately, its address is reported to ittnotify via
281         KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately
282         and lock routine calls to KMP_WAIT(), the later should report the
283         same address, not an address of low-level spinner.
284 #endif // USE_ITT_BUILD
285     TODO: make inline function (move to header file for icl)
286 */
287 template <typename UT>
288 static UT __kmp_wait(volatile UT *spinner, UT checker,
289                      kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
290   // note: we may not belong to a team at this point
291   volatile UT *spin = spinner;
292   UT check = checker;
293   kmp_uint32 spins;
294   kmp_uint32 (*f)(UT, UT) = pred;
295   UT r;
296 
297   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
298   KMP_INIT_YIELD(spins);
299   // main wait spin loop
300   while (!f(r = *spin, check)) {
301     KMP_FSYNC_SPIN_PREPARE(obj);
302     /* GEH - remove this since it was accidentally introduced when kmp_wait was
303        split.
304        It causes problems with infinite recursion because of exit lock */
305     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
306         __kmp_abort_thread(); */
307     // If oversubscribed, or have waited a bit then yield.
308     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
309   }
310   KMP_FSYNC_SPIN_ACQUIRED(obj);
311   return r;
312 }
313 
314 /* ------------------------------------------------------------------------ */
315 /* ------------------------------------------------------------------------ */
316 
317 template <typename UT>
318 void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
319   dispatch_private_info_template<UT> *pr;
320 
321   int gtid = *gtid_ref;
322   //    int  cid = *cid_ref;
323   kmp_info_t *th = __kmp_threads[gtid];
324   KMP_DEBUG_ASSERT(th->th.th_dispatch);
325 
326   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
327   if (__kmp_env_consistency_check) {
328     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
329         th->th.th_dispatch->th_dispatch_pr_current);
330     if (pr->pushed_ws != ct_none) {
331 #if KMP_USE_DYNAMIC_LOCK
332       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
333 #else
334       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
335 #endif
336     }
337   }
338 
339   if (!th->th.th_team->t.t_serialized) {
340     dispatch_shared_info_template<UT> *sh =
341         reinterpret_cast<dispatch_shared_info_template<UT> *>(
342             th->th.th_dispatch->th_dispatch_sh_current);
343     UT lower;
344 
345     if (!__kmp_env_consistency_check) {
346       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
347           th->th.th_dispatch->th_dispatch_pr_current);
348     }
349     lower = pr->u.p.ordered_lower;
350 
351 #if !defined(KMP_GOMP_COMPAT)
352     if (__kmp_env_consistency_check) {
353       if (pr->ordered_bumped) {
354         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
355         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
356                                ct_ordered_in_pdo, loc_ref,
357                                &p->stack_data[p->w_top]);
358       }
359     }
360 #endif /* !defined(KMP_GOMP_COMPAT) */
361 
362     KMP_MB();
363 #ifdef KMP_DEBUG
364     {
365       char *buff;
366       // create format specifiers before the debug output
367       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
368                               "ordered_iter:%%%s lower:%%%s\n",
369                               traits_t<UT>::spec, traits_t<UT>::spec);
370       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
371       __kmp_str_free(&buff);
372     }
373 #endif
374     __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
375                    __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
376     KMP_MB(); /* is this necessary? */
377 #ifdef KMP_DEBUG
378     {
379       char *buff;
380       // create format specifiers before the debug output
381       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
382                               "ordered_iter:%%%s lower:%%%s\n",
383                               traits_t<UT>::spec, traits_t<UT>::spec);
384       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
385       __kmp_str_free(&buff);
386     }
387 #endif
388   }
389   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
390 }
391 
392 template <typename UT>
393 void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
394   typedef typename traits_t<UT>::signed_t ST;
395   dispatch_private_info_template<UT> *pr;
396 
397   int gtid = *gtid_ref;
398   //    int  cid = *cid_ref;
399   kmp_info_t *th = __kmp_threads[gtid];
400   KMP_DEBUG_ASSERT(th->th.th_dispatch);
401 
402   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
403   if (__kmp_env_consistency_check) {
404     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
405         th->th.th_dispatch->th_dispatch_pr_current);
406     if (pr->pushed_ws != ct_none) {
407       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
408     }
409   }
410 
411   if (!th->th.th_team->t.t_serialized) {
412     dispatch_shared_info_template<UT> *sh =
413         reinterpret_cast<dispatch_shared_info_template<UT> *>(
414             th->th.th_dispatch->th_dispatch_sh_current);
415 
416     if (!__kmp_env_consistency_check) {
417       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
418           th->th.th_dispatch->th_dispatch_pr_current);
419     }
420 
421     KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
422 #if !defined(KMP_GOMP_COMPAT)
423     if (__kmp_env_consistency_check) {
424       if (pr->ordered_bumped != 0) {
425         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
426         /* How to test it? - OM */
427         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
428                                ct_ordered_in_pdo, loc_ref,
429                                &p->stack_data[p->w_top]);
430       }
431     }
432 #endif /* !defined(KMP_GOMP_COMPAT) */
433 
434     KMP_MB(); /* Flush all pending memory write invalidates.  */
435 
436     pr->ordered_bumped += 1;
437 
438     KD_TRACE(1000,
439              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
440               gtid, pr->ordered_bumped));
441 
442     KMP_MB(); /* Flush all pending memory write invalidates.  */
443 
444     /* TODO use general release procedure? */
445     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
446 
447     KMP_MB(); /* Flush all pending memory write invalidates.  */
448   }
449   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
450 }
451 
452 /* Computes and returns x to the power of y, where y must a non-negative integer
453  */
454 template <typename UT>
455 static __forceinline long double __kmp_pow(long double x, UT y) {
456   long double s = 1.0L;
457 
458   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
459   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
460   while (y) {
461     if (y & 1)
462       s *= x;
463     x *= x;
464     y >>= 1;
465   }
466   return s;
467 }
468 
469 /* Computes and returns the number of unassigned iterations after idx chunks
470    have been assigned
471    (the total number of unassigned iterations in chunks with index greater than
472    or equal to idx).
473    __forceinline seems to be broken so that if we __forceinline this function,
474    the behavior is wrong
475    (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
476 */
477 template <typename T>
478 static __inline typename traits_t<T>::unsigned_t
479 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
480                                 typename traits_t<T>::unsigned_t idx) {
481   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
482      least for ICL 8.1, long double arithmetic may not really have
483      long double precision, even with /Qlong_double.  Currently, we
484      workaround that in the caller code, by manipulating the FPCW for
485      Windows* OS on IA-32 architecture.  The lack of precision is not
486      expected to be a correctness issue, though.
487   */
488   typedef typename traits_t<T>::unsigned_t UT;
489 
490   long double x = tc * __kmp_pow<UT>(base, idx);
491   UT r = (UT)x;
492   if (x == r)
493     return r;
494   return r + 1;
495 }
496 
497 // Parameters of the guided-iterative algorithm:
498 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
499 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
500 // by default n = 2. For example with n = 3 the chunks distribution will be more
501 // flat.
502 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
503 static const int guided_int_param = 2;
504 static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
505 #endif // KMP_DISPATCH_H
506