/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  kmp_lock_t *steal_lock; // lock used for chunk stealing

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).
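  // (In this runtime T is a 4- or 8-byte integer type, so parm1-4 occupy at
  // most 32 bytes; a 32-byte-aligned block of that size cannot straddle a
  // typical 64-byte cache line.)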
  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  UT pchunks; // total number of chunks for processes with p-core
  UT num_procs_with_pcore; // number of threads with p-core
  T first_thread_with_ecore;
#endif
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
  kmp_uint32 ordered_bumped;
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile ST num_done;
  volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration a scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin wait loop that pauses between checks.
    Waits until the function returns non-zero when called with *spinner and
    check. Does NOT put threads to sleep.
    Arguments:
        UT is an unsigned 4- or 8-byte type
        spinner - memory location to check value
        checker - value which spinner is >, <, ==, etc.
        pred - predicate function to perform binary comparison of some sort
#if USE_ITT_BUILD
        obj -- is the higher-level synchronization object to report to
          ittnotify. It is used to report locks consistently. For example, if a
          lock is acquired immediately, its address is reported to ittnotify
          via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
          immediately and the lock routine calls KMP_WAIT(), the latter should
          report the same address, not the address of the low-level spinner.
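          For illustration, the ordered-entry code in __kmp_dispatch_deo()
          below spins on the shared ordered_iteration counter with
              __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                             __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
          passing NULL as the obj argument.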
#endif // USE_ITT_BUILD
        TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  kmp_uint64 time;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned
   (the total number of unassigned iterations in chunks with index greater than
   or equal to idx).
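   Illustrative example (values assumed for clarity only): with tc = 1000
   total iterations and base = 0.875, idx = 2 gives 1000 * 0.875^2 = 765.625,
   so the function below returns 766.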
   __forceinline seems to be broken here: if we __forceinline this function,
   the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double. Currently, we
     work around that in the caller code, by manipulating the FPCW for
     Windows* OS on IA-32 architecture. The lack of precision is not
     expected to be a correctness issue, though.
  */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter; with n = 1 the first chunk is the same as for the static schedule,
// i.e. trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
#endif // KMP_DISPATCH_H
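
// A worked instance of the guided parameters above (illustrative values only,
// not used anywhere in the runtime): with the default n = guided_int_param = 2,
// nproc = 4 threads, and a requested chunk of 7 iterations,
//   p3 = 1 / (2 * 4) = 0.125   and   p2 = 2 * 4 * (7 + 1) = 64,
// i.e. the guided-iterative schedule is expected to switch to plain dynamic
// chunking around the point where roughly 64 iterations remain.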