/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  kmp_lock_t *steal_lock; // lock used for chunk stealing
  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  //    a) parm3 is properly aligned and
  //    b) all of parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured, though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
  kmp_uint32 ordered_bumped;
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile ST num_done;
  volatile UT ordered_iteration;
  // padding to retain the structure size after making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need a union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin wait loop that pauses between checks.
    Waits until the predicate returns non-zero when called with *spinner and
    check. Does NOT put threads to sleep.
    Arguments:
        UT is an unsigned 4- or 8-byte type
        spinner - memory location whose value is checked
        checker - value against which *spinner is compared (>, <, ==, etc.)
        pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
        obj - higher-level synchronization object to report to ittnotify. It
              is used to report locks consistently. For example, if a lock is
              acquired immediately, its address is reported to ittnotify via
              KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
              immediately and the lock routine calls KMP_WAIT(), the latter
              should report the same address, not the address of the
              low-level spinner.
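              Illustrative usage (this exact call appears in
              __kmp_dispatch_deo() below):
                  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                                 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
              It spins until ordered_iteration >= lower and passes obj == NULL,
              i.e. no higher-level object is reported for that wait.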
#endif // USE_ITT_BUILD
    TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit, then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx).
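   Worked example (illustrative values only, not taken from the runtime): with
   tc = 1000 and base = 0.875, after idx = 8 chunks have been assigned this
   returns ceil(1000 * 0.875^8) = 344 iterations still unassigned.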
   __forceinline seems to be broken, so that if we __forceinline this function
   the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double. Currently, we
     work around that in the caller code, by manipulating the FPCW for
     Windows* OS on IA-32 architecture. The lack of precision is not
     expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution is flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.,
// trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
#endif // KMP_DISPATCH_H
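
/* Illustrative sketch of the guided-iterative parameters above (assumed
   example values, for exposition only): with nproc = 8, chunk = 1 and the
   default n = guided_int_param = 2,
     p2 = 2 * 8 * (1 + 1) = 32      // point of switching to dynamic
     p3 = 1.0 / (2 * 8)  = 0.0625   // fraction of remaining iterations per grab
   so a 1000-iteration loop would hand out chunks of roughly 62, 59, 55, ...
   iterations, shrinking geometrically until only the dynamic tail remains. */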