/*
 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define __KMP_IMP
#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"
#include "ompt-specific.h"

#define MAX_MESSAGE 512

// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc in source location information
 * @param flags in for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is a no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shutdown the runtime library. This is also optional, and even if called will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (it can cause library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
#if KMP_OS_WINDOWS && OMPT_SUPPORT
  // Normal exit process on Windows does not allow worker threads of the final
  // parallel region to finish reporting their events, so shutting down the
  // library here fixes the issue at least for the cases where __kmpc_end() is
  // placed properly.
  if (ompt_enabled.enabled)
    __kmp_internal_end_library(__kmp_gtid_get_specific());
#endif
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).
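
For illustration only (an assumed call site, not code from this file), a
compiler or a foreign-thread entry point would typically fetch the value once
and then pass it to later __kmpc_* calls:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc); // 'loc' is a compiler-built ident_t
__kmpc_barrier(&loc, gtid);
@endcode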

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads under control of the OpenMP<sup>*</sup> runtime

This function can be called in any context.
It returns the total number of threads under the control of the OpenMP runtime.
That is not a number that can be determined by any OpenMP standard calls, since
the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, since the runtime maintains
underlying threads even when they are not active (because the cost of creating
and destroying OS threads is high), this call counts all such threads even if
they are not waiting for work.
*/
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));

  return TCR_4(__kmp_all_nth);
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The thread number of the calling thread in the innermost active parallel
construct.
*/
kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
  return __kmp_tid_from_gtid(__kmp_entry_gtid());
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads in the innermost active parallel construct.
*/
kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));

  return __kmp_entry_thread()->th.th_team->t.t_nproc;
}

/*!
 * @ingroup DEPRECATED
 * @param loc location description
 *
 * This function need not be called. It always returns TRUE.
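 *
 * Illustrative only (an assumption about typical compiler-generated code,
 * not a sequence required by this file): the result can be used to choose
 * between a real fork and a serialized fallback,
 * @code
 * kmp_int32 gtid = __kmpc_global_thread_num(&loc), btid = 0;
 * if (__kmpc_ok_to_fork(&loc)) {
 *   __kmpc_fork_call(&loc, 2, outlined_fn, &a, &b); // two shared arguments
 * } else {
 *   __kmpc_serialized_parallel(&loc, gtid);
 *   outlined_fn(&gtid, &btid, &a, &b); // kmpc_micro: (gtid *, bound_tid *, ...)
 *   __kmpc_end_serialized_parallel(&loc, gtid);
 * }
 * @endcode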
154 */ 155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) { 156 #ifndef KMP_DEBUG 157 158 return TRUE; 159 160 #else 161 162 const char *semi2; 163 const char *semi3; 164 int line_no; 165 166 if (__kmp_par_range == 0) { 167 return TRUE; 168 } 169 semi2 = loc->psource; 170 if (semi2 == NULL) { 171 return TRUE; 172 } 173 semi2 = strchr(semi2, ';'); 174 if (semi2 == NULL) { 175 return TRUE; 176 } 177 semi2 = strchr(semi2 + 1, ';'); 178 if (semi2 == NULL) { 179 return TRUE; 180 } 181 if (__kmp_par_range_filename[0]) { 182 const char *name = semi2 - 1; 183 while ((name > loc->psource) && (*name != '/') && (*name != ';')) { 184 name--; 185 } 186 if ((*name == '/') || (*name == ';')) { 187 name++; 188 } 189 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) { 190 return __kmp_par_range < 0; 191 } 192 } 193 semi3 = strchr(semi2 + 1, ';'); 194 if (__kmp_par_range_routine[0]) { 195 if ((semi3 != NULL) && (semi3 > semi2) && 196 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) { 197 return __kmp_par_range < 0; 198 } 199 } 200 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) { 201 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) { 202 return __kmp_par_range > 0; 203 } 204 return __kmp_par_range < 0; 205 } 206 return TRUE; 207 208 #endif /* KMP_DEBUG */ 209 } 210 211 /*! 212 @ingroup THREAD_STATES 213 @param loc Source location information. 214 @return 1 if this thread is executing inside an active parallel region, zero if 215 not. 216 */ 217 kmp_int32 __kmpc_in_parallel(ident_t *loc) { 218 return __kmp_entry_thread()->th.th_root->r.r_active; 219 } 220 221 /*! 222 @ingroup PARALLEL 223 @param loc source location information 224 @param global_tid global thread number 225 @param num_threads number of threads requested for this parallel construct 226 227 Set the number of threads to be used by the next fork spawned by this thread. 228 This call is only required if the parallel construct has a `num_threads` clause. 229 */ 230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, 231 kmp_int32 num_threads) { 232 KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n", 233 global_tid, num_threads)); 234 235 __kmp_push_num_threads(loc, global_tid, num_threads); 236 } 237 238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) { 239 KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n")); 240 241 /* the num_threads are automatically popped */ 242 } 243 244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, 245 kmp_int32 proc_bind) { 246 KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid, 247 proc_bind)); 248 249 __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind); 250 } 251 252 /*! 253 @ingroup PARALLEL 254 @param loc source location information 255 @param argc total number of arguments in the ellipsis 256 @param microtask pointer to callback routine consisting of outlined parallel 257 construct 258 @param ... pointers to shared variables that aren't global 259 260 Do the actual fork and call the microtask in the relevant number of threads. 261 */ 262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 
{ 263 int gtid = __kmp_entry_gtid(); 264 265 #if (KMP_STATS_ENABLED) 266 // If we were in a serial region, then stop the serial timer, record 267 // the event, and start parallel region timer 268 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 269 if (previous_state == stats_state_e::SERIAL_REGION) { 270 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead); 271 } else { 272 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead); 273 } 274 int inParallel = __kmpc_in_parallel(loc); 275 if (inParallel) { 276 KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL); 277 } else { 278 KMP_COUNT_BLOCK(OMP_PARALLEL); 279 } 280 #endif 281 282 // maybe to save thr_state is enough here 283 { 284 va_list ap; 285 va_start(ap, microtask); 286 287 #if OMPT_SUPPORT 288 ompt_frame_t *ompt_frame; 289 if (ompt_enabled.enabled) { 290 kmp_info_t *master_th = __kmp_threads[gtid]; 291 kmp_team_t *parent_team = master_th->th.th_team; 292 ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info; 293 if (lwt) 294 ompt_frame = &(lwt->ompt_task_info.frame); 295 else { 296 int tid = __kmp_tid_from_gtid(gtid); 297 ompt_frame = &( 298 parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame); 299 } 300 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 301 OMPT_STORE_RETURN_ADDRESS(gtid); 302 } 303 #endif 304 305 #if INCLUDE_SSC_MARKS 306 SSC_MARK_FORKING(); 307 #endif 308 __kmp_fork_call(loc, gtid, fork_context_intel, argc, 309 VOLATILE_CAST(microtask_t) microtask, // "wrapped" task 310 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, 311 /* TODO: revert workaround for Intel(R) 64 tracker #96 */ 312 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 313 &ap 314 #else 315 ap 316 #endif 317 ); 318 #if INCLUDE_SSC_MARKS 319 SSC_MARK_JOINING(); 320 #endif 321 __kmp_join_call(loc, gtid 322 #if OMPT_SUPPORT 323 , 324 fork_context_intel 325 #endif 326 ); 327 328 va_end(ap); 329 } 330 331 #if KMP_STATS_ENABLED 332 if (previous_state == stats_state_e::SERIAL_REGION) { 333 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial); 334 } else { 335 KMP_POP_PARTITIONED_TIMER(); 336 } 337 #endif // KMP_STATS_ENABLED 338 } 339 340 /*! 341 @ingroup PARALLEL 342 @param loc source location information 343 @param global_tid global thread number 344 @param num_teams number of teams requested for the teams construct 345 @param num_threads number of threads per team requested for the teams construct 346 347 Set the number of teams to be used by the teams construct. 348 This call is only required if the teams construct has a `num_teams` clause 349 or a `thread_limit` clause (or both). 350 */ 351 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, 352 kmp_int32 num_teams, kmp_int32 num_threads) { 353 KA_TRACE(20, 354 ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n", 355 global_tid, num_teams, num_threads)); 356 357 __kmp_push_num_teams(loc, global_tid, num_teams, num_threads); 358 } 359 360 /*! 361 @ingroup PARALLEL 362 @param loc source location information 363 @param argc total number of arguments in the ellipsis 364 @param microtask pointer to callback routine consisting of outlined teams 365 construct 366 @param ... pointers to shared variables that aren't global 367 368 Do the actual fork and call the microtask in the relevant number of threads. 369 */ 370 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, 371 ...) 
{ 372 int gtid = __kmp_entry_gtid(); 373 kmp_info_t *this_thr = __kmp_threads[gtid]; 374 va_list ap; 375 va_start(ap, microtask); 376 377 #if KMP_STATS_ENABLED 378 KMP_COUNT_BLOCK(OMP_TEAMS); 379 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 380 if (previous_state == stats_state_e::SERIAL_REGION) { 381 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead); 382 } else { 383 KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead); 384 } 385 #endif 386 387 // remember teams entry point and nesting level 388 this_thr->th.th_teams_microtask = microtask; 389 this_thr->th.th_teams_level = 390 this_thr->th.th_team->t.t_level; // AC: can be >0 on host 391 392 #if OMPT_SUPPORT 393 kmp_team_t *parent_team = this_thr->th.th_team; 394 int tid = __kmp_tid_from_gtid(gtid); 395 if (ompt_enabled.enabled) { 396 parent_team->t.t_implicit_task_taskdata[tid] 397 .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 398 } 399 OMPT_STORE_RETURN_ADDRESS(gtid); 400 #endif 401 402 // check if __kmpc_push_num_teams called, set default number of teams 403 // otherwise 404 if (this_thr->th.th_teams_size.nteams == 0) { 405 __kmp_push_num_teams(loc, gtid, 0, 0); 406 } 407 KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1); 408 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1); 409 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1); 410 411 __kmp_fork_call(loc, gtid, fork_context_intel, argc, 412 VOLATILE_CAST(microtask_t) 413 __kmp_teams_master, // "wrapped" task 414 VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, 415 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 416 &ap 417 #else 418 ap 419 #endif 420 ); 421 __kmp_join_call(loc, gtid 422 #if OMPT_SUPPORT 423 , 424 fork_context_intel 425 #endif 426 ); 427 428 // Pop current CG root off list 429 KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots); 430 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 431 this_thr->th.th_cg_roots = tmp->up; 432 KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up" 433 " to node %p. cg_nthreads was %d\n", 434 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads)); 435 KMP_DEBUG_ASSERT(tmp->cg_nthreads); 436 int i = tmp->cg_nthreads--; 437 if (i == 1) { // check is we are the last thread in CG (not always the case) 438 __kmp_free(tmp); 439 } 440 // Restore current task's thread_limit from CG root 441 KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots); 442 this_thr->th.th_current_task->td_icvs.thread_limit = 443 this_thr->th.th_cg_roots->cg_thread_limit; 444 445 this_thr->th.th_teams_microtask = NULL; 446 this_thr->th.th_teams_level = 0; 447 *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L; 448 va_end(ap); 449 #if KMP_STATS_ENABLED 450 if (previous_state == stats_state_e::SERIAL_REGION) { 451 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial); 452 } else { 453 KMP_POP_PARTITIONED_TIMER(); 454 } 455 #endif // KMP_STATS_ENABLED 456 } 457 458 // I don't think this function should ever have been exported. 459 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated 460 // openmp code ever called it, but it's been exported from the RTL for so 461 // long that I'm afraid to remove the definition. 462 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); } 463 464 /*! 465 @ingroup PARALLEL 466 @param loc source location information 467 @param global_tid global thread number 468 469 Enter a serialized parallel construct. 
This interface is used to handle a 470 conditional parallel region, like this, 471 @code 472 #pragma omp parallel if (condition) 473 @endcode 474 when the condition is false. 475 */ 476 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 477 // The implementation is now in kmp_runtime.cpp so that it can share static 478 // functions with kmp_fork_call since the tasks to be done are similar in 479 // each case. 480 #if OMPT_SUPPORT 481 OMPT_STORE_RETURN_ADDRESS(global_tid); 482 #endif 483 __kmp_serialized_parallel(loc, global_tid); 484 } 485 486 /*! 487 @ingroup PARALLEL 488 @param loc source location information 489 @param global_tid global thread number 490 491 Leave a serialized parallel construct. 492 */ 493 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 494 kmp_internal_control_t *top; 495 kmp_info_t *this_thr; 496 kmp_team_t *serial_team; 497 498 KC_TRACE(10, 499 ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid)); 500 501 /* skip all this code for autopar serialized loops since it results in 502 unacceptable overhead */ 503 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 504 return; 505 506 // Not autopar code 507 if (!TCR_4(__kmp_init_parallel)) 508 __kmp_parallel_initialize(); 509 510 __kmp_resume_if_soft_paused(); 511 512 this_thr = __kmp_threads[global_tid]; 513 serial_team = this_thr->th.th_serial_team; 514 515 kmp_task_team_t *task_team = this_thr->th.th_task_team; 516 // we need to wait for the proxy tasks before finishing the thread 517 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) 518 __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL)); 519 520 KMP_MB(); 521 KMP_DEBUG_ASSERT(serial_team); 522 KMP_ASSERT(serial_team->t.t_serialized); 523 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 524 KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team); 525 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 526 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 527 528 #if OMPT_SUPPORT 529 if (ompt_enabled.enabled && 530 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 531 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none; 532 if (ompt_enabled.ompt_callback_implicit_task) { 533 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 534 ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1, 535 OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit); 536 } 537 538 // reset clear the task id only after unlinking the task 539 ompt_data_t *parent_task_data; 540 __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL); 541 542 if (ompt_enabled.ompt_callback_parallel_end) { 543 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 544 &(serial_team->t.ompt_team_info.parallel_data), parent_task_data, 545 ompt_parallel_invoker_program | ompt_parallel_team, 546 OMPT_LOAD_RETURN_ADDRESS(global_tid)); 547 } 548 __ompt_lw_taskteam_unlink(this_thr); 549 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 550 } 551 #endif 552 553 /* If necessary, pop the internal control stack values and replace the team 554 * values */ 555 top = serial_team->t.t_control_stack_top; 556 if (top && top->serial_nesting_level == serial_team->t.t_serialized) { 557 copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top); 558 serial_team->t.t_control_stack_top = top->next; 559 __kmp_free(top); 560 } 561 562 // if( serial_team -> t.t_serialized > 1 ) 563 serial_team->t.t_level--; 564 565 /* pop dispatch buffers stack */ 566 
KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); 567 { 568 dispatch_private_info_t *disp_buffer = 569 serial_team->t.t_dispatch->th_disp_buffer; 570 serial_team->t.t_dispatch->th_disp_buffer = 571 serial_team->t.t_dispatch->th_disp_buffer->next; 572 __kmp_free(disp_buffer); 573 } 574 this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore 575 576 --serial_team->t.t_serialized; 577 if (serial_team->t.t_serialized == 0) { 578 579 /* return to the parallel section */ 580 581 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 582 if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) { 583 __kmp_clear_x87_fpu_status_word(); 584 __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word); 585 __kmp_load_mxcsr(&serial_team->t.t_mxcsr); 586 } 587 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 588 589 this_thr->th.th_team = serial_team->t.t_parent; 590 this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid; 591 592 /* restore values cached in the thread */ 593 this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */ 594 this_thr->th.th_team_master = 595 serial_team->t.t_parent->t.t_threads[0]; /* JPH */ 596 this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized; 597 598 /* TODO the below shouldn't need to be adjusted for serialized teams */ 599 this_thr->th.th_dispatch = 600 &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid]; 601 602 __kmp_pop_current_task_from_thread(this_thr); 603 604 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0); 605 this_thr->th.th_current_task->td_flags.executing = 1; 606 607 if (__kmp_tasking_mode != tskm_immediate_exec) { 608 // Copy the task team from the new child / old parent team to the thread. 609 this_thr->th.th_task_team = 610 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]; 611 KA_TRACE(20, 612 ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / " 613 "team %p\n", 614 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 615 } 616 } else { 617 if (__kmp_tasking_mode != tskm_immediate_exec) { 618 KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting " 619 "depth of serial team %p to %d\n", 620 global_tid, serial_team, serial_team->t.t_serialized)); 621 } 622 } 623 624 if (__kmp_env_consistency_check) 625 __kmp_pop_parallel(global_tid, NULL); 626 #if OMPT_SUPPORT 627 if (ompt_enabled.enabled) 628 this_thr->th.ompt_thread_info.state = 629 ((this_thr->th.th_team_serialized) ? ompt_state_work_serial 630 : ompt_state_work_parallel); 631 #endif 632 } 633 634 /*! 635 @ingroup SYNCHRONIZATION 636 @param loc source location information. 637 638 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though 639 depending on the memory ordering convention obeyed by the compiler 640 even that may not be necessary). 641 */ 642 void __kmpc_flush(ident_t *loc) { 643 KC_TRACE(10, ("__kmpc_flush: called\n")); 644 645 /* need explicit __mf() here since use volatile instead in library */ 646 KMP_MB(); /* Flush all pending memory write invalidates. */ 647 648 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64) 649 #if KMP_MIC 650 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used. 651 // We shouldn't need it, though, since the ABI rules require that 652 // * If the compiler generates NGO stores it also generates the fence 653 // * If users hand-code NGO stores they should insert the fence 654 // therefore no incomplete unordered stores should be visible. 
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
       KMP_ARCH_RISCV64)
// Nothing to see here; move along.
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  //   while (!flag) {
  //     #pragma omp flush(flag)
  //   }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield();
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/* ------------------------------------------------------------------------ */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  // this function is called when the 'barrier' directive is present or at the
  // implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set it to 0 when a new team is created
  // 3) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}

/* The BARRIER for a MASTER section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
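
As an illustrative sketch (an assumed lowering, not something mandated by this
file), `#pragma omp master` is typically emitted as a guarded call pair:
@code
if (__kmpc_master(&loc, gtid)) {
  // ... master-only code ...
  __kmpc_end_master(&loc, gtid);
}
// no implied barrier; the compiler emits __kmpc_barrier() separately if needed
@endcode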
760 */ 761 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) { 762 int status = 0; 763 764 KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid)); 765 766 if (!TCR_4(__kmp_init_parallel)) 767 __kmp_parallel_initialize(); 768 769 __kmp_resume_if_soft_paused(); 770 771 if (KMP_MASTER_GTID(global_tid)) { 772 KMP_COUNT_BLOCK(OMP_MASTER); 773 KMP_PUSH_PARTITIONED_TIMER(OMP_master); 774 status = 1; 775 } 776 777 #if OMPT_SUPPORT && OMPT_OPTIONAL 778 if (status) { 779 if (ompt_enabled.ompt_callback_master) { 780 kmp_info_t *this_thr = __kmp_threads[global_tid]; 781 kmp_team_t *team = this_thr->th.th_team; 782 783 int tid = __kmp_tid_from_gtid(global_tid); 784 ompt_callbacks.ompt_callback(ompt_callback_master)( 785 ompt_scope_begin, &(team->t.ompt_team_info.parallel_data), 786 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 787 OMPT_GET_RETURN_ADDRESS(0)); 788 } 789 } 790 #endif 791 792 if (__kmp_env_consistency_check) { 793 #if KMP_USE_DYNAMIC_LOCK 794 if (status) 795 __kmp_push_sync(global_tid, ct_master, loc, NULL, 0); 796 else 797 __kmp_check_sync(global_tid, ct_master, loc, NULL, 0); 798 #else 799 if (status) 800 __kmp_push_sync(global_tid, ct_master, loc, NULL); 801 else 802 __kmp_check_sync(global_tid, ct_master, loc, NULL); 803 #endif 804 } 805 806 return status; 807 } 808 809 /*! 810 @ingroup WORK_SHARING 811 @param loc source location information. 812 @param global_tid global thread number . 813 814 Mark the end of a <tt>master</tt> region. This should only be called by the 815 thread that executes the <tt>master</tt> region. 816 */ 817 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) { 818 KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid)); 819 820 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid)); 821 KMP_POP_PARTITIONED_TIMER(); 822 823 #if OMPT_SUPPORT && OMPT_OPTIONAL 824 kmp_info_t *this_thr = __kmp_threads[global_tid]; 825 kmp_team_t *team = this_thr->th.th_team; 826 if (ompt_enabled.ompt_callback_master) { 827 int tid = __kmp_tid_from_gtid(global_tid); 828 ompt_callbacks.ompt_callback(ompt_callback_master)( 829 ompt_scope_end, &(team->t.ompt_team_info.parallel_data), 830 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 831 OMPT_GET_RETURN_ADDRESS(0)); 832 } 833 #endif 834 835 if (__kmp_env_consistency_check) { 836 if (global_tid < 0) 837 KMP_WARNING(ThreadIdentInvalid); 838 839 if (KMP_MASTER_GTID(global_tid)) 840 __kmp_pop_sync(global_tid, ct_master, loc); 841 } 842 } 843 844 /*! 845 @ingroup WORK_SHARING 846 @param loc source location information. 847 @param gtid global thread number. 848 849 Start execution of an <tt>ordered</tt> construct. 
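
A sketch of the usual pairing inside the body of an ordered loop chunk
(illustrative only; the surrounding loop and dispatch calls are omitted):
@code
__kmpc_ordered(&loc, gtid);
// ... statements that must execute in iteration order ...
__kmpc_end_ordered(&loc, gtid);
@endcode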
*/
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
  // TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_wait_id_t lck;
  void *codeptr_ra;
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = ompt_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
          codeptr_ra);
    }
  }
#endif

  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = ompt_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

End execution of an <tt>ordered</tt> construct.
*/
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
  // TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
            ->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}

#if KMP_USE_DYNAMIC_LOCK

static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
960 void *idx; 961 kmp_indirect_lock_t **lck; 962 lck = (kmp_indirect_lock_t **)crit; 963 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag); 964 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock); 965 KMP_SET_I_LOCK_LOCATION(ilk, loc); 966 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section); 967 KA_TRACE(20, 968 ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag)); 969 #if USE_ITT_BUILD 970 __kmp_itt_critical_creating(ilk->lock, loc); 971 #endif 972 int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk); 973 if (status == 0) { 974 #if USE_ITT_BUILD 975 __kmp_itt_critical_destroyed(ilk->lock); 976 #endif 977 // We don't really need to destroy the unclaimed lock here since it will be 978 // cleaned up at program exit. 979 // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx); 980 } 981 KMP_DEBUG_ASSERT(*lck != NULL); 982 } 983 984 // Fast-path acquire tas lock 985 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \ 986 { \ 987 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 988 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 989 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 990 if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 991 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ 992 kmp_uint32 spins; \ 993 KMP_FSYNC_PREPARE(l); \ 994 KMP_INIT_YIELD(spins); \ 995 kmp_backoff_t backoff = __kmp_spin_backoff_params; \ 996 do { \ 997 if (TCR_4(__kmp_nth) > \ 998 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 999 KMP_YIELD(TRUE); \ 1000 } else { \ 1001 KMP_YIELD_SPIN(spins); \ 1002 } \ 1003 __kmp_spin_backoff(&backoff); \ 1004 } while ( \ 1005 KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 1006 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)); \ 1007 } \ 1008 KMP_FSYNC_ACQUIRED(l); \ 1009 } 1010 1011 // Fast-path test tas lock 1012 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \ 1013 { \ 1014 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 1015 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 1016 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 1017 rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \ 1018 __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \ 1019 } 1020 1021 // Fast-path release tas lock 1022 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \ 1023 { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); } 1024 1025 #if KMP_USE_FUTEX 1026 1027 #include <sys/syscall.h> 1028 #include <unistd.h> 1029 #ifndef FUTEX_WAIT 1030 #define FUTEX_WAIT 0 1031 #endif 1032 #ifndef FUTEX_WAKE 1033 #define FUTEX_WAKE 1 1034 #endif 1035 1036 // Fast-path acquire futex lock 1037 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \ 1038 { \ 1039 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1040 kmp_int32 gtid_code = (gtid + 1) << 1; \ 1041 KMP_MB(); \ 1042 KMP_FSYNC_PREPARE(ftx); \ 1043 kmp_int32 poll_val; \ 1044 while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \ 1045 &(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1046 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \ 1047 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \ 1048 if (!cond) { \ 1049 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \ 1050 poll_val | \ 1051 KMP_LOCK_BUSY(1, futex))) { \ 1052 continue; \ 1053 } \ 1054 poll_val |= KMP_LOCK_BUSY(1, futex); \ 1055 } \ 1056 kmp_int32 rc; \ 1057 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \ 1058 NULL, NULL, 0)) != 0) { \ 1059 continue; \ 1060 } \ 1061 gtid_code |= 1; \ 1062 } \ 1063 KMP_FSYNC_ACQUIRED(ftx); \ 1064 } 1065 1066 // Fast-path test futex lock 1067 
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \ 1068 { \ 1069 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1070 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1071 KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \ 1072 KMP_FSYNC_ACQUIRED(ftx); \ 1073 rc = TRUE; \ 1074 } else { \ 1075 rc = FALSE; \ 1076 } \ 1077 } 1078 1079 // Fast-path release futex lock 1080 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \ 1081 { \ 1082 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1083 KMP_MB(); \ 1084 KMP_FSYNC_RELEASING(ftx); \ 1085 kmp_int32 poll_val = \ 1086 KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \ 1087 if (KMP_LOCK_STRIP(poll_val) & 1) { \ 1088 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \ 1089 KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \ 1090 } \ 1091 KMP_MB(); \ 1092 KMP_YIELD_OVERSUB(); \ 1093 } 1094 1095 #endif // KMP_USE_FUTEX 1096 1097 #else // KMP_USE_DYNAMIC_LOCK 1098 1099 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit, 1100 ident_t const *loc, 1101 kmp_int32 gtid) { 1102 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; 1103 1104 // Because of the double-check, the following load doesn't need to be volatile 1105 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); 1106 1107 if (lck == NULL) { 1108 void *idx; 1109 1110 // Allocate & initialize the lock. 1111 // Remember alloc'ed locks in table in order to free them in __kmp_cleanup() 1112 lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section); 1113 __kmp_init_user_lock_with_checks(lck); 1114 __kmp_set_user_lock_location(lck, loc); 1115 #if USE_ITT_BUILD 1116 __kmp_itt_critical_creating(lck); 1117 // __kmp_itt_critical_creating() should be called *before* the first usage 1118 // of underlying lock. It is the only place where we can guarantee it. There 1119 // are chances the lock will destroyed with no usage, but it is not a 1120 // problem, because this is not real event seen by user but rather setting 1121 // name for object (lock). See more details in kmp_itt.h. 1122 #endif /* USE_ITT_BUILD */ 1123 1124 // Use a cmpxchg instruction to slam the start of the critical section with 1125 // the lock pointer. If another thread beat us to it, deallocate the lock, 1126 // and use the lock that the other thread allocated. 1127 int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck); 1128 1129 if (status == 0) { 1130 // Deallocate the lock and reload the value. 1131 #if USE_ITT_BUILD 1132 __kmp_itt_critical_destroyed(lck); 1133 // Let ITT know the lock is destroyed and the same memory location may be reused 1134 // for another purpose. 1135 #endif /* USE_ITT_BUILD */ 1136 __kmp_destroy_user_lock_with_checks(lck); 1137 __kmp_user_lock_free(&idx, gtid, lck); 1138 lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); 1139 KMP_DEBUG_ASSERT(lck != NULL); 1140 } 1141 } 1142 return lck; 1143 } 1144 1145 #endif // KMP_USE_DYNAMIC_LOCK 1146 1147 /*! 1148 @ingroup WORK_SHARING 1149 @param loc source location information. 1150 @param global_tid global thread number . 1151 @param crit identity of the critical section. This could be a pointer to a lock 1152 associated with the critical section, or some other suitably unique value. 1153 1154 Enter code protected by a `critical` construct. 1155 This function blocks until the executing thread can enter the critical section. 
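
An illustrative lowering of a named critical section (an assumed shape, not a
sequence taken from this file); every occurrence of the same critical name
shares one zero-initialized kmp_critical_name object:
@code
static kmp_critical_name crit_name_lck = {0};
__kmpc_critical(&loc, gtid, &crit_name_lck);
// ... protected code ...
__kmpc_end_critical(&loc, gtid, &crit_name_lck);
@endcode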
*/
void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
                     kmp_critical_name *crit) {
#if KMP_USE_DYNAMIC_LOCK
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif // OMPT_SUPPORT
  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
  KMP_COUNT_BLOCK(OMP_CRITICAL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
#endif
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  // TODO: add THR_OVHD_STATE

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  KMP_CHECK_USER_LOCK_INIT();

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  // since the critical directive binds to all threads, not just the current
  // team, we have to check this even if we are in a serialized team.
  // also, even if we are the uber thread, we still have to conduct the lock,
  // as we have to contend with sibling threads.

#if USE_ITT_BUILD
  __kmp_itt_critical_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  void *codeptr_ra = NULL;
  if (ompt_enabled.enabled) {
    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
    /* OMPT state update */
    prev_state = ti.state;
    ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
    ti.state = ompt_state_wait_critical;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
          (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
1225 __kmp_acquire_user_lock_with_checks(lck, global_tid); 1226 1227 #if USE_ITT_BUILD 1228 __kmp_itt_critical_acquired(lck); 1229 #endif /* USE_ITT_BUILD */ 1230 #if OMPT_SUPPORT && OMPT_OPTIONAL 1231 if (ompt_enabled.enabled) { 1232 /* OMPT state update */ 1233 ti.state = prev_state; 1234 ti.wait_id = 0; 1235 1236 /* OMPT event callback */ 1237 if (ompt_enabled.ompt_callback_mutex_acquired) { 1238 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 1239 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra); 1240 } 1241 } 1242 #endif 1243 KMP_POP_PARTITIONED_TIMER(); 1244 1245 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1246 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); 1247 #endif // KMP_USE_DYNAMIC_LOCK 1248 } 1249 1250 #if KMP_USE_DYNAMIC_LOCK 1251 1252 // Converts the given hint to an internal lock implementation 1253 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { 1254 #if KMP_USE_TSX 1255 #define KMP_TSX_LOCK(seq) lockseq_##seq 1256 #else 1257 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq 1258 #endif 1259 1260 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1261 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) 1262 #else 1263 #define KMP_CPUINFO_RTM 0 1264 #endif 1265 1266 // Hints that do not require further logic 1267 if (hint & kmp_lock_hint_hle) 1268 return KMP_TSX_LOCK(hle); 1269 if (hint & kmp_lock_hint_rtm) 1270 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq; 1271 if (hint & kmp_lock_hint_adaptive) 1272 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq; 1273 1274 // Rule out conflicting hints first by returning the default lock 1275 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended)) 1276 return __kmp_user_lock_seq; 1277 if ((hint & omp_lock_hint_speculative) && 1278 (hint & omp_lock_hint_nonspeculative)) 1279 return __kmp_user_lock_seq; 1280 1281 // Do not even consider speculation when it appears to be contended 1282 if (hint & omp_lock_hint_contended) 1283 return lockseq_queuing; 1284 1285 // Uncontended lock without speculation 1286 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative)) 1287 return lockseq_tas; 1288 1289 // HLE lock for speculation 1290 if (hint & omp_lock_hint_speculative) 1291 return KMP_TSX_LOCK(hle); 1292 1293 return __kmp_user_lock_seq; 1294 } 1295 1296 #if OMPT_SUPPORT && OMPT_OPTIONAL 1297 #if KMP_USE_DYNAMIC_LOCK 1298 static kmp_mutex_impl_t 1299 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) { 1300 if (user_lock) { 1301 switch (KMP_EXTRACT_D_TAG(user_lock)) { 1302 case 0: 1303 break; 1304 #if KMP_USE_FUTEX 1305 case locktag_futex: 1306 return kmp_mutex_impl_queuing; 1307 #endif 1308 case locktag_tas: 1309 return kmp_mutex_impl_spin; 1310 #if KMP_USE_TSX 1311 case locktag_hle: 1312 return kmp_mutex_impl_speculative; 1313 #endif 1314 default: 1315 return kmp_mutex_impl_none; 1316 } 1317 ilock = KMP_LOOKUP_I_LOCK(user_lock); 1318 } 1319 KMP_ASSERT(ilock); 1320 switch (ilock->type) { 1321 #if KMP_USE_TSX 1322 case locktag_adaptive: 1323 case locktag_rtm: 1324 return kmp_mutex_impl_speculative; 1325 #endif 1326 case locktag_nested_tas: 1327 return kmp_mutex_impl_spin; 1328 #if KMP_USE_FUTEX 1329 case locktag_nested_futex: 1330 #endif 1331 case locktag_ticket: 1332 case locktag_queuing: 1333 case locktag_drdpa: 1334 case locktag_nested_ticket: 1335 case locktag_nested_queuing: 1336 case locktag_nested_drdpa: 1337 return kmp_mutex_impl_queuing; 1338 default: 1339 return kmp_mutex_impl_none; 
1340 } 1341 } 1342 #else 1343 // For locks without dynamic binding 1344 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() { 1345 switch (__kmp_user_lock_kind) { 1346 case lk_tas: 1347 return kmp_mutex_impl_spin; 1348 #if KMP_USE_FUTEX 1349 case lk_futex: 1350 #endif 1351 case lk_ticket: 1352 case lk_queuing: 1353 case lk_drdpa: 1354 return kmp_mutex_impl_queuing; 1355 #if KMP_USE_TSX 1356 case lk_hle: 1357 case lk_rtm: 1358 case lk_adaptive: 1359 return kmp_mutex_impl_speculative; 1360 #endif 1361 default: 1362 return kmp_mutex_impl_none; 1363 } 1364 } 1365 #endif // KMP_USE_DYNAMIC_LOCK 1366 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1367 1368 /*! 1369 @ingroup WORK_SHARING 1370 @param loc source location information. 1371 @param global_tid global thread number. 1372 @param crit identity of the critical section. This could be a pointer to a lock 1373 associated with the critical section, or some other suitably unique value. 1374 @param hint the lock hint. 1375 1376 Enter code protected by a `critical` construct with a hint. The hint value is 1377 used to suggest a lock implementation. This function blocks until the executing 1378 thread can enter the critical section unless the hint suggests use of 1379 speculative execution and the hardware supports it. 1380 */ 1381 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid, 1382 kmp_critical_name *crit, uint32_t hint) { 1383 KMP_COUNT_BLOCK(OMP_CRITICAL); 1384 kmp_user_lock_p lck; 1385 #if OMPT_SUPPORT && OMPT_OPTIONAL 1386 ompt_state_t prev_state = ompt_state_undefined; 1387 ompt_thread_info_t ti; 1388 // This is the case, if called from __kmpc_critical: 1389 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1390 if (!codeptr) 1391 codeptr = OMPT_GET_RETURN_ADDRESS(0); 1392 #endif 1393 1394 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); 1395 1396 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 1397 // Check if it is initialized. 1398 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); 1399 if (*lk == 0) { 1400 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint); 1401 if (KMP_IS_D_LOCK(lckseq)) { 1402 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 1403 KMP_GET_D_TAG(lckseq)); 1404 } else { 1405 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq)); 1406 } 1407 } 1408 // Branch for accessing the actual lock object and set operation. This 1409 // branching is inevitable since this lock initialization does not follow the 1410 // normal dispatch path (lock table is not used). 
1411 if (KMP_EXTRACT_D_TAG(lk) != 0) { 1412 lck = (kmp_user_lock_p)lk; 1413 if (__kmp_env_consistency_check) { 1414 __kmp_push_sync(global_tid, ct_critical, loc, lck, 1415 __kmp_map_hint_to_lock(hint)); 1416 } 1417 #if USE_ITT_BUILD 1418 __kmp_itt_critical_acquiring(lck); 1419 #endif 1420 #if OMPT_SUPPORT && OMPT_OPTIONAL 1421 if (ompt_enabled.enabled) { 1422 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1423 /* OMPT state update */ 1424 prev_state = ti.state; 1425 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck; 1426 ti.state = ompt_state_wait_critical; 1427 1428 /* OMPT event callback */ 1429 if (ompt_enabled.ompt_callback_mutex_acquire) { 1430 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1431 ompt_mutex_critical, (unsigned int)hint, 1432 __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck, 1433 codeptr); 1434 } 1435 } 1436 #endif 1437 #if KMP_USE_INLINED_TAS 1438 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1439 KMP_ACQUIRE_TAS_LOCK(lck, global_tid); 1440 } else 1441 #elif KMP_USE_INLINED_FUTEX 1442 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1443 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid); 1444 } else 1445 #endif 1446 { 1447 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 1448 } 1449 } else { 1450 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 1451 lck = ilk->lock; 1452 if (__kmp_env_consistency_check) { 1453 __kmp_push_sync(global_tid, ct_critical, loc, lck, 1454 __kmp_map_hint_to_lock(hint)); 1455 } 1456 #if USE_ITT_BUILD 1457 __kmp_itt_critical_acquiring(lck); 1458 #endif 1459 #if OMPT_SUPPORT && OMPT_OPTIONAL 1460 if (ompt_enabled.enabled) { 1461 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1462 /* OMPT state update */ 1463 prev_state = ti.state; 1464 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck; 1465 ti.state = ompt_state_wait_critical; 1466 1467 /* OMPT event callback */ 1468 if (ompt_enabled.ompt_callback_mutex_acquire) { 1469 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1470 ompt_mutex_critical, (unsigned int)hint, 1471 __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck, 1472 codeptr); 1473 } 1474 } 1475 #endif 1476 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 1477 } 1478 KMP_POP_PARTITIONED_TIMER(); 1479 1480 #if USE_ITT_BUILD 1481 __kmp_itt_critical_acquired(lck); 1482 #endif /* USE_ITT_BUILD */ 1483 #if OMPT_SUPPORT && OMPT_OPTIONAL 1484 if (ompt_enabled.enabled) { 1485 /* OMPT state update */ 1486 ti.state = prev_state; 1487 ti.wait_id = 0; 1488 1489 /* OMPT event callback */ 1490 if (ompt_enabled.ompt_callback_mutex_acquired) { 1491 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 1492 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 1493 } 1494 } 1495 #endif 1496 1497 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1498 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); 1499 } // __kmpc_critical_with_hint 1500 1501 #endif // KMP_USE_DYNAMIC_LOCK 1502 1503 /*! 1504 @ingroup WORK_SHARING 1505 @param loc source location information. 1506 @param global_tid global thread number . 1507 @param crit identity of the critical section. This could be a pointer to a lock 1508 associated with the critical section, or some other suitably unique value. 1509 1510 Leave a critical section, releasing any lock that was held during its execution. 
1511 */ 1512 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, 1513 kmp_critical_name *crit) { 1514 kmp_user_lock_p lck; 1515 1516 KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid)); 1517 1518 #if KMP_USE_DYNAMIC_LOCK 1519 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 1520 lck = (kmp_user_lock_p)crit; 1521 KMP_ASSERT(lck != NULL); 1522 if (__kmp_env_consistency_check) { 1523 __kmp_pop_sync(global_tid, ct_critical, loc); 1524 } 1525 #if USE_ITT_BUILD 1526 __kmp_itt_critical_releasing(lck); 1527 #endif 1528 #if KMP_USE_INLINED_TAS 1529 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1530 KMP_RELEASE_TAS_LOCK(lck, global_tid); 1531 } else 1532 #elif KMP_USE_INLINED_FUTEX 1533 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1534 KMP_RELEASE_FUTEX_LOCK(lck, global_tid); 1535 } else 1536 #endif 1537 { 1538 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 1539 } 1540 } else { 1541 kmp_indirect_lock_t *ilk = 1542 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 1543 KMP_ASSERT(ilk != NULL); 1544 lck = ilk->lock; 1545 if (__kmp_env_consistency_check) { 1546 __kmp_pop_sync(global_tid, ct_critical, loc); 1547 } 1548 #if USE_ITT_BUILD 1549 __kmp_itt_critical_releasing(lck); 1550 #endif 1551 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid); 1552 } 1553 1554 #else // KMP_USE_DYNAMIC_LOCK 1555 1556 if ((__kmp_user_lock_kind == lk_tas) && 1557 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { 1558 lck = (kmp_user_lock_p)crit; 1559 } 1560 #if KMP_USE_FUTEX 1561 else if ((__kmp_user_lock_kind == lk_futex) && 1562 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { 1563 lck = (kmp_user_lock_p)crit; 1564 } 1565 #endif 1566 else { // ticket, queuing or drdpa 1567 lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit)); 1568 } 1569 1570 KMP_ASSERT(lck != NULL); 1571 1572 if (__kmp_env_consistency_check) 1573 __kmp_pop_sync(global_tid, ct_critical, loc); 1574 1575 #if USE_ITT_BUILD 1576 __kmp_itt_critical_releasing(lck); 1577 #endif /* USE_ITT_BUILD */ 1578 // Value of 'crit' should be good for using as a critical_id of the critical 1579 // section directive. 1580 __kmp_release_user_lock_with_checks(lck, global_tid); 1581 1582 #endif // KMP_USE_DYNAMIC_LOCK 1583 1584 #if OMPT_SUPPORT && OMPT_OPTIONAL 1585 /* OMPT release event triggers after lock is released; place here to trigger 1586 * for all #if branches */ 1587 OMPT_STORE_RETURN_ADDRESS(global_tid); 1588 if (ompt_enabled.ompt_callback_mutex_released) { 1589 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 1590 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, 1591 OMPT_LOAD_RETURN_ADDRESS(0)); 1592 } 1593 #endif 1594 1595 KMP_POP_PARTITIONED_TIMER(); 1596 KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid)); 1597 } 1598 1599 /*! 1600 @ingroup SYNCHRONIZATION 1601 @param loc source location information 1602 @param global_tid thread id. 1603 @return one if the thread should execute the master block, zero otherwise 1604 1605 Start execution of a combined barrier and master. The barrier is executed inside 1606 this function. 
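
Illustrative call sequence (an assumption about how a combined barrier/master
might be lowered, not taken from this file):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // ... code run by the master thread while the others wait ...
  __kmpc_end_barrier_master(&loc, gtid); // releases the waiting threads
}
@endcode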
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  return (status != 0) ? 0 : 1;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the work an "end" call would
otherwise do (see __kmpc_end_master) is performed inside this function.
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1681 } 1682 __kmp_check_barrier(global_tid, ct_barrier, loc); 1683 } 1684 1685 #if OMPT_SUPPORT 1686 ompt_frame_t *ompt_frame; 1687 if (ompt_enabled.enabled) { 1688 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1689 if (ompt_frame->enter_frame.ptr == NULL) 1690 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1691 OMPT_STORE_RETURN_ADDRESS(global_tid); 1692 } 1693 #endif 1694 #if USE_ITT_NOTIFY 1695 __kmp_threads[global_tid]->th.th_ident = loc; 1696 #endif 1697 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 1698 #if OMPT_SUPPORT && OMPT_OPTIONAL 1699 if (ompt_enabled.enabled) { 1700 ompt_frame->enter_frame = ompt_data_none; 1701 } 1702 #endif 1703 1704 ret = __kmpc_master(loc, global_tid); 1705 1706 if (__kmp_env_consistency_check) { 1707 /* there's no __kmpc_end_master called; so the (stats) */ 1708 /* actions of __kmpc_end_master are done here */ 1709 1710 if (global_tid < 0) { 1711 KMP_WARNING(ThreadIdentInvalid); 1712 } 1713 if (ret) { 1714 /* only one thread should do the pop since only */ 1715 /* one did the push (see __kmpc_master()) */ 1716 1717 __kmp_pop_sync(global_tid, ct_master, loc); 1718 } 1719 } 1720 1721 return (ret); 1722 } 1723 1724 /* The BARRIER for a SINGLE process section is always explicit */ 1725 /*! 1726 @ingroup WORK_SHARING 1727 @param loc source location information 1728 @param global_tid global thread number 1729 @return One if this thread should execute the single construct, zero otherwise. 1730 1731 Test whether to execute a <tt>single</tt> construct. 1732 There are no implicit barriers in the two "single" calls, rather the compiler 1733 should introduce an explicit barrier if it is required. 1734 */ 1735 1736 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) { 1737 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE); 1738 1739 if (rc) { 1740 // We are going to execute the single statement, so we should count it. 1741 KMP_COUNT_BLOCK(OMP_SINGLE); 1742 KMP_PUSH_PARTITIONED_TIMER(OMP_single); 1743 } 1744 1745 #if OMPT_SUPPORT && OMPT_OPTIONAL 1746 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1747 kmp_team_t *team = this_thr->th.th_team; 1748 int tid = __kmp_tid_from_gtid(global_tid); 1749 1750 if (ompt_enabled.enabled) { 1751 if (rc) { 1752 if (ompt_enabled.ompt_callback_work) { 1753 ompt_callbacks.ompt_callback(ompt_callback_work)( 1754 ompt_work_single_executor, ompt_scope_begin, 1755 &(team->t.ompt_team_info.parallel_data), 1756 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1757 1, OMPT_GET_RETURN_ADDRESS(0)); 1758 } 1759 } else { 1760 if (ompt_enabled.ompt_callback_work) { 1761 ompt_callbacks.ompt_callback(ompt_callback_work)( 1762 ompt_work_single_other, ompt_scope_begin, 1763 &(team->t.ompt_team_info.parallel_data), 1764 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1765 1, OMPT_GET_RETURN_ADDRESS(0)); 1766 ompt_callbacks.ompt_callback(ompt_callback_work)( 1767 ompt_work_single_other, ompt_scope_end, 1768 &(team->t.ompt_team_info.parallel_data), 1769 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1770 1, OMPT_GET_RETURN_ADDRESS(0)); 1771 } 1772 } 1773 } 1774 #endif 1775 1776 return rc; 1777 } 1778 1779 /*! 1780 @ingroup WORK_SHARING 1781 @param loc source location information 1782 @param global_tid global thread number 1783 1784 Mark the end of a <tt>single</tt> construct. This function should 1785 only be called by the thread that executed the block of code protected 1786 by the `single` construct. 
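
For context, a sketch of the complete `single` lowering (illustrative only;
the trailing barrier is emitted by the compiler only when there is no
`nowait` clause):
@code
if (__kmpc_single(&loc, gtid)) {
  // ... block executed by exactly one thread ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // omitted for 'single nowait'
@endcode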
1787 */ 1788 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { 1789 __kmp_exit_single(global_tid); 1790 KMP_POP_PARTITIONED_TIMER(); 1791 1792 #if OMPT_SUPPORT && OMPT_OPTIONAL 1793 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1794 kmp_team_t *team = this_thr->th.th_team; 1795 int tid = __kmp_tid_from_gtid(global_tid); 1796 1797 if (ompt_enabled.ompt_callback_work) { 1798 ompt_callbacks.ompt_callback(ompt_callback_work)( 1799 ompt_work_single_executor, ompt_scope_end, 1800 &(team->t.ompt_team_info.parallel_data), 1801 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, 1802 OMPT_GET_RETURN_ADDRESS(0)); 1803 } 1804 #endif 1805 } 1806 1807 /*! 1808 @ingroup WORK_SHARING 1809 @param loc Source location 1810 @param global_tid Global thread id 1811 1812 Mark the end of a statically scheduled loop. 1813 */ 1814 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) { 1815 KMP_POP_PARTITIONED_TIMER(); 1816 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); 1817 1818 #if OMPT_SUPPORT && OMPT_OPTIONAL 1819 if (ompt_enabled.ompt_callback_work) { 1820 ompt_work_t ompt_work_type = ompt_work_loop; 1821 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1822 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1823 // Determine workshare type 1824 if (loc != NULL) { 1825 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) { 1826 ompt_work_type = ompt_work_loop; 1827 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) { 1828 ompt_work_type = ompt_work_sections; 1829 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) { 1830 ompt_work_type = ompt_work_distribute; 1831 } else { 1832 // use default set above. 1833 // a warning about this case is provided in __kmpc_for_static_init 1834 } 1835 KMP_DEBUG_ASSERT(ompt_work_type); 1836 } 1837 ompt_callbacks.ompt_callback(ompt_callback_work)( 1838 ompt_work_type, ompt_scope_end, &(team_info->parallel_data), 1839 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); 1840 } 1841 #endif 1842 if (__kmp_env_consistency_check) 1843 __kmp_pop_workshare(global_tid, ct_pdo, loc); 1844 } 1845 1846 // User routines which take C-style arguments (call by value) 1847 // different from the Fortran equivalent routines 1848 1849 void ompc_set_num_threads(int arg) { 1850 // !!!!! TODO: check the per-task binding 1851 __kmp_set_num_threads(arg, __kmp_entry_gtid()); 1852 } 1853 1854 void ompc_set_dynamic(int flag) { 1855 kmp_info_t *thread; 1856 1857 /* For the thread-private implementation of the internal controls */ 1858 thread = __kmp_entry_thread(); 1859 1860 __kmp_save_internal_controls(thread); 1861 1862 set__dynamic(thread, flag ? TRUE : FALSE); 1863 } 1864 1865 void ompc_set_nested(int flag) { 1866 kmp_info_t *thread; 1867 1868 /* For the thread-private internal controls implementation */ 1869 thread = __kmp_entry_thread(); 1870 1871 __kmp_save_internal_controls(thread); 1872 1873 set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1); 1874 } 1875 1876 void ompc_set_max_active_levels(int max_active_levels) { 1877 /* TO DO */ 1878 /* we want per-task implementation of this internal control */ 1879 1880 /* For the per-thread internal controls implementation */ 1881 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels); 1882 } 1883 1884 void ompc_set_schedule(omp_sched_t kind, int modifier) { 1885 // !!!!! 
TODO: check the per-task binding 1886 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier); 1887 } 1888 1889 int ompc_get_ancestor_thread_num(int level) { 1890 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level); 1891 } 1892 1893 int ompc_get_team_size(int level) { 1894 return __kmp_get_team_size(__kmp_entry_gtid(), level); 1895 } 1896 1897 /* OpenMP 5.0 Affinity Format API */ 1898 1899 void ompc_set_affinity_format(char const *format) { 1900 if (!__kmp_init_serial) { 1901 __kmp_serial_initialize(); 1902 } 1903 __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE, 1904 format, KMP_STRLEN(format) + 1); 1905 } 1906 1907 size_t ompc_get_affinity_format(char *buffer, size_t size) { 1908 size_t format_size; 1909 if (!__kmp_init_serial) { 1910 __kmp_serial_initialize(); 1911 } 1912 format_size = KMP_STRLEN(__kmp_affinity_format); 1913 if (buffer && size) { 1914 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format, 1915 format_size + 1); 1916 } 1917 return format_size; 1918 } 1919 1920 void ompc_display_affinity(char const *format) { 1921 int gtid; 1922 if (!TCR_4(__kmp_init_middle)) { 1923 __kmp_middle_initialize(); 1924 } 1925 gtid = __kmp_get_gtid(); 1926 __kmp_aux_display_affinity(gtid, format); 1927 } 1928 1929 size_t ompc_capture_affinity(char *buffer, size_t buf_size, 1930 char const *format) { 1931 int gtid; 1932 size_t num_required; 1933 kmp_str_buf_t capture_buf; 1934 if (!TCR_4(__kmp_init_middle)) { 1935 __kmp_middle_initialize(); 1936 } 1937 gtid = __kmp_get_gtid(); 1938 __kmp_str_buf_init(&capture_buf); 1939 num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf); 1940 if (buffer && buf_size) { 1941 __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str, 1942 capture_buf.used + 1); 1943 } 1944 __kmp_str_buf_free(&capture_buf); 1945 return num_required; 1946 } 1947 1948 void kmpc_set_stacksize(int arg) { 1949 // __kmp_aux_set_stacksize initializes the library if needed 1950 __kmp_aux_set_stacksize(arg); 1951 } 1952 1953 void kmpc_set_stacksize_s(size_t arg) { 1954 // __kmp_aux_set_stacksize initializes the library if needed 1955 __kmp_aux_set_stacksize(arg); 1956 } 1957 1958 void kmpc_set_blocktime(int arg) { 1959 int gtid, tid; 1960 kmp_info_t *thread; 1961 1962 gtid = __kmp_entry_gtid(); 1963 tid = __kmp_tid_from_gtid(gtid); 1964 thread = __kmp_thread_from_gtid(gtid); 1965 1966 __kmp_aux_set_blocktime(arg, thread, tid); 1967 } 1968 1969 void kmpc_set_library(int arg) { 1970 // __kmp_user_set_library initializes the library if needed 1971 __kmp_user_set_library((enum library_type)arg); 1972 } 1973 1974 void kmpc_set_defaults(char const *str) { 1975 // __kmp_aux_set_defaults initializes the library if needed 1976 __kmp_aux_set_defaults(str, KMP_STRLEN(str)); 1977 } 1978 1979 void kmpc_set_disp_num_buffers(int arg) { 1980 // ignore after initialization because some teams have already 1981 // allocated dispatch buffers 1982 if (__kmp_init_serial == 0 && arg > 0) 1983 __kmp_dispatch_num_buffers = arg; 1984 } 1985 1986 int kmpc_set_affinity_mask_proc(int proc, void **mask) { 1987 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1988 return -1; 1989 #else 1990 if (!TCR_4(__kmp_init_middle)) { 1991 __kmp_middle_initialize(); 1992 } 1993 return __kmp_aux_set_affinity_mask_proc(proc, mask); 1994 #endif 1995 } 1996 1997 int kmpc_unset_affinity_mask_proc(int proc, void **mask) { 1998 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1999 return -1; 2000 #else 2001 if (!TCR_4(__kmp_init_middle)) { 2002 __kmp_middle_initialize(); 
2003 } 2004 return __kmp_aux_unset_affinity_mask_proc(proc, mask); 2005 #endif 2006 } 2007 2008 int kmpc_get_affinity_mask_proc(int proc, void **mask) { 2009 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 2010 return -1; 2011 #else 2012 if (!TCR_4(__kmp_init_middle)) { 2013 __kmp_middle_initialize(); 2014 } 2015 return __kmp_aux_get_affinity_mask_proc(proc, mask); 2016 #endif 2017 } 2018 2019 /* -------------------------------------------------------------------------- */ 2020 /*! 2021 @ingroup THREADPRIVATE 2022 @param loc source location information 2023 @param gtid global thread number 2024 @param cpy_size size of the cpy_data buffer 2025 @param cpy_data pointer to data to be copied 2026 @param cpy_func helper function to call for copying data 2027 @param didit flag variable: 1=single thread; 0=not single thread 2028 2029 __kmpc_copyprivate implements the interface for the private data broadcast 2030 needed for the copyprivate clause associated with a single region in an 2031 OpenMP<sup>*</sup> program (both C and Fortran). 2032 All threads participating in the parallel region call this routine. 2033 One of the threads (called the single thread) should have the <tt>didit</tt> 2034 variable set to 1 and all other threads should have that variable set to 0. 2035 All threads pass a pointer to a data buffer (cpy_data) that they have built. 2036 2037 The OpenMP specification forbids the use of nowait on the single region when a 2038 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a 2039 barrier internally to avoid race conditions, so the code generation for the 2040 single region should avoid generating a barrier after the call to @ref 2041 __kmpc_copyprivate. 2042 2043 The <tt>gtid</tt> parameter is the global thread id for the current thread. 2044 The <tt>loc</tt> parameter is a pointer to source location information. 2045 2046 Internal implementation: The single thread will first copy its descriptor 2047 address (cpy_data) to a team-private location, then the other threads will each 2048 call the function pointed to by the parameter cpy_func, which carries out the 2049 copy by copying the data using the cpy_data buffer. 2050 2051 The cpy_func routine used for the copy and the contents of the data area defined 2052 by cpy_data and cpy_size may be built in any fashion that will allow the copy 2053 to be done. For instance, the cpy_data buffer can hold the actual data to be 2054 copied or it may hold a list of pointers to the data. The cpy_func routine must 2055 interpret the cpy_data buffer appropriately. 2056 2057 The interface to cpy_func is as follows: 2058 @code 2059 void cpy_func( void *destination, void *source ) 2060 @endcode 2061 where void *destination is the cpy_data pointer for the thread being copied to 2062 and void *source is the cpy_data pointer for the thread being copied from. 
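For illustration only, a minimal <tt>cpy_func</tt> for the case where each
thread's cpy_data buffer holds a single <tt>int</tt> directly (rather than a
list of pointers) might look like:
@code
void cpy_func(void *destination, void *source) {
  // destination: this thread's cpy_data buffer
  // source:      the single thread's cpy_data buffer
  *(int *)destination = *(int *)source;
}
@endcode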
2063 */ 2064 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, 2065 void *cpy_data, void (*cpy_func)(void *, void *), 2066 kmp_int32 didit) { 2067 void **data_ptr; 2068 2069 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid)); 2070 2071 KMP_MB(); 2072 2073 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; 2074 2075 if (__kmp_env_consistency_check) { 2076 if (loc == 0) { 2077 KMP_WARNING(ConstructIdentInvalid); 2078 } 2079 } 2080 2081 // ToDo: Optimize the following two barriers into some kind of split barrier 2082 2083 if (didit) 2084 *data_ptr = cpy_data; 2085 2086 #if OMPT_SUPPORT 2087 ompt_frame_t *ompt_frame; 2088 if (ompt_enabled.enabled) { 2089 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 2090 if (ompt_frame->enter_frame.ptr == NULL) 2091 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 2092 OMPT_STORE_RETURN_ADDRESS(gtid); 2093 } 2094 #endif 2095 /* This barrier is not a barrier region boundary */ 2096 #if USE_ITT_NOTIFY 2097 __kmp_threads[gtid]->th.th_ident = loc; 2098 #endif 2099 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2100 2101 if (!didit) 2102 (*cpy_func)(cpy_data, *data_ptr); 2103 2104 // Consider next barrier a user-visible barrier for barrier region boundaries 2105 // Nesting checks are already handled by the single construct checks 2106 2107 #if OMPT_SUPPORT 2108 if (ompt_enabled.enabled) { 2109 OMPT_STORE_RETURN_ADDRESS(gtid); 2110 } 2111 #endif 2112 #if USE_ITT_NOTIFY 2113 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 2114 // tasks can overwrite the location) 2115 #endif 2116 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2117 #if OMPT_SUPPORT && OMPT_OPTIONAL 2118 if (ompt_enabled.enabled) { 2119 ompt_frame->enter_frame = ompt_data_none; 2120 } 2121 #endif 2122 } 2123 2124 /* -------------------------------------------------------------------------- */ 2125 2126 #define INIT_LOCK __kmp_init_user_lock_with_checks 2127 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 2128 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 2129 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 2130 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 2131 #define ACQUIRE_NESTED_LOCK_TIMED \ 2132 __kmp_acquire_nested_user_lock_with_checks_timed 2133 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 2134 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 2135 #define TEST_LOCK __kmp_test_user_lock_with_checks 2136 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 2137 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 2138 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 2139 2140 // TODO: Make check abort messages use location info & pass it into 2141 // with_checks routines 2142 2143 #if KMP_USE_DYNAMIC_LOCK 2144 2145 // internal lock initializer 2146 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock, 2147 kmp_dyna_lockseq_t seq) { 2148 if (KMP_IS_D_LOCK(seq)) { 2149 KMP_INIT_D_LOCK(lock, seq); 2150 #if USE_ITT_BUILD 2151 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 2152 #endif 2153 } else { 2154 KMP_INIT_I_LOCK(lock, seq); 2155 #if USE_ITT_BUILD 2156 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2157 __kmp_itt_lock_creating(ilk->lock, loc); 2158 #endif 2159 } 2160 } 2161 2162 // internal nest lock initializer 2163 static __forceinline void 2164 __kmp_init_nest_lock_with_hint(ident_t *loc, 
void **lock, 2165 kmp_dyna_lockseq_t seq) { 2166 #if KMP_USE_TSX 2167 // Don't have nested lock implementation for speculative locks 2168 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 2169 seq = __kmp_user_lock_seq; 2170 #endif 2171 switch (seq) { 2172 case lockseq_tas: 2173 seq = lockseq_nested_tas; 2174 break; 2175 #if KMP_USE_FUTEX 2176 case lockseq_futex: 2177 seq = lockseq_nested_futex; 2178 break; 2179 #endif 2180 case lockseq_ticket: 2181 seq = lockseq_nested_ticket; 2182 break; 2183 case lockseq_queuing: 2184 seq = lockseq_nested_queuing; 2185 break; 2186 case lockseq_drdpa: 2187 seq = lockseq_nested_drdpa; 2188 break; 2189 default: 2190 seq = lockseq_nested_queuing; 2191 } 2192 KMP_INIT_I_LOCK(lock, seq); 2193 #if USE_ITT_BUILD 2194 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2195 __kmp_itt_lock_creating(ilk->lock, loc); 2196 #endif 2197 } 2198 2199 /* initialize the lock with a hint */ 2200 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, 2201 uintptr_t hint) { 2202 KMP_DEBUG_ASSERT(__kmp_init_serial); 2203 if (__kmp_env_consistency_check && user_lock == NULL) { 2204 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 2205 } 2206 2207 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2208 2209 #if OMPT_SUPPORT && OMPT_OPTIONAL 2210 // This is the case, if called from omp_init_lock_with_hint: 2211 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2212 if (!codeptr) 2213 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2214 if (ompt_enabled.ompt_callback_lock_init) { 2215 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2216 ompt_mutex_lock, (omp_lock_hint_t)hint, 2217 __ompt_get_mutex_impl_type(user_lock), 2218 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2219 } 2220 #endif 2221 } 2222 2223 /* initialize the lock with a hint */ 2224 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, 2225 void **user_lock, uintptr_t hint) { 2226 KMP_DEBUG_ASSERT(__kmp_init_serial); 2227 if (__kmp_env_consistency_check && user_lock == NULL) { 2228 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 2229 } 2230 2231 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2232 2233 #if OMPT_SUPPORT && OMPT_OPTIONAL 2234 // This is the case, if called from omp_init_lock_with_hint: 2235 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2236 if (!codeptr) 2237 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2238 if (ompt_enabled.ompt_callback_lock_init) { 2239 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2240 ompt_mutex_nest_lock, (omp_lock_hint_t)hint, 2241 __ompt_get_mutex_impl_type(user_lock), 2242 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2243 } 2244 #endif 2245 } 2246 2247 #endif // KMP_USE_DYNAMIC_LOCK 2248 2249 /* initialize the lock */ 2250 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2251 #if KMP_USE_DYNAMIC_LOCK 2252 2253 KMP_DEBUG_ASSERT(__kmp_init_serial); 2254 if (__kmp_env_consistency_check && user_lock == NULL) { 2255 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 2256 } 2257 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2258 2259 #if OMPT_SUPPORT && OMPT_OPTIONAL 2260 // This is the case, if called from omp_init_lock_with_hint: 2261 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2262 if (!codeptr) 2263 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2264 if (ompt_enabled.ompt_callback_lock_init) { 2265 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2266 ompt_mutex_lock, omp_lock_hint_none, 2267 
__ompt_get_mutex_impl_type(user_lock), 2268 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2269 } 2270 #endif 2271 2272 #else // KMP_USE_DYNAMIC_LOCK 2273 2274 static char const *const func = "omp_init_lock"; 2275 kmp_user_lock_p lck; 2276 KMP_DEBUG_ASSERT(__kmp_init_serial); 2277 2278 if (__kmp_env_consistency_check) { 2279 if (user_lock == NULL) { 2280 KMP_FATAL(LockIsUninitialized, func); 2281 } 2282 } 2283 2284 KMP_CHECK_USER_LOCK_INIT(); 2285 2286 if ((__kmp_user_lock_kind == lk_tas) && 2287 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2288 lck = (kmp_user_lock_p)user_lock; 2289 } 2290 #if KMP_USE_FUTEX 2291 else if ((__kmp_user_lock_kind == lk_futex) && 2292 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2293 lck = (kmp_user_lock_p)user_lock; 2294 } 2295 #endif 2296 else { 2297 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2298 } 2299 INIT_LOCK(lck); 2300 __kmp_set_user_lock_location(lck, loc); 2301 2302 #if OMPT_SUPPORT && OMPT_OPTIONAL 2303 // This is the case, if called from omp_init_lock_with_hint: 2304 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2305 if (!codeptr) 2306 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2307 if (ompt_enabled.ompt_callback_lock_init) { 2308 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2309 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2310 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2311 } 2312 #endif 2313 2314 #if USE_ITT_BUILD 2315 __kmp_itt_lock_creating(lck); 2316 #endif /* USE_ITT_BUILD */ 2317 2318 #endif // KMP_USE_DYNAMIC_LOCK 2319 } // __kmpc_init_lock 2320 2321 /* initialize the lock */ 2322 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2323 #if KMP_USE_DYNAMIC_LOCK 2324 2325 KMP_DEBUG_ASSERT(__kmp_init_serial); 2326 if (__kmp_env_consistency_check && user_lock == NULL) { 2327 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 2328 } 2329 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2330 2331 #if OMPT_SUPPORT && OMPT_OPTIONAL 2332 // This is the case, if called from omp_init_lock_with_hint: 2333 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2334 if (!codeptr) 2335 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2336 if (ompt_enabled.ompt_callback_lock_init) { 2337 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2338 ompt_mutex_nest_lock, omp_lock_hint_none, 2339 __ompt_get_mutex_impl_type(user_lock), 2340 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2341 } 2342 #endif 2343 2344 #else // KMP_USE_DYNAMIC_LOCK 2345 2346 static char const *const func = "omp_init_nest_lock"; 2347 kmp_user_lock_p lck; 2348 KMP_DEBUG_ASSERT(__kmp_init_serial); 2349 2350 if (__kmp_env_consistency_check) { 2351 if (user_lock == NULL) { 2352 KMP_FATAL(LockIsUninitialized, func); 2353 } 2354 } 2355 2356 KMP_CHECK_USER_LOCK_INIT(); 2357 2358 if ((__kmp_user_lock_kind == lk_tas) && 2359 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2360 OMP_NEST_LOCK_T_SIZE)) { 2361 lck = (kmp_user_lock_p)user_lock; 2362 } 2363 #if KMP_USE_FUTEX 2364 else if ((__kmp_user_lock_kind == lk_futex) && 2365 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2366 OMP_NEST_LOCK_T_SIZE)) { 2367 lck = (kmp_user_lock_p)user_lock; 2368 } 2369 #endif 2370 else { 2371 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2372 } 2373 2374 INIT_NESTED_LOCK(lck); 2375 __kmp_set_user_lock_location(lck, loc); 2376 2377 #if OMPT_SUPPORT && OMPT_OPTIONAL 2378 // This is the case, if called from omp_init_lock_with_hint: 2379 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 
2380 if (!codeptr) 2381 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2382 if (ompt_enabled.ompt_callback_lock_init) { 2383 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2384 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2385 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2386 } 2387 #endif 2388 2389 #if USE_ITT_BUILD 2390 __kmp_itt_lock_creating(lck); 2391 #endif /* USE_ITT_BUILD */ 2392 2393 #endif // KMP_USE_DYNAMIC_LOCK 2394 } // __kmpc_init_nest_lock 2395 2396 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2397 #if KMP_USE_DYNAMIC_LOCK 2398 2399 #if USE_ITT_BUILD 2400 kmp_user_lock_p lck; 2401 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2402 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2403 } else { 2404 lck = (kmp_user_lock_p)user_lock; 2405 } 2406 __kmp_itt_lock_destroyed(lck); 2407 #endif 2408 #if OMPT_SUPPORT && OMPT_OPTIONAL 2409 // This is the case, if called from omp_init_lock_with_hint: 2410 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2411 if (!codeptr) 2412 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2413 if (ompt_enabled.ompt_callback_lock_destroy) { 2414 kmp_user_lock_p lck; 2415 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2416 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2417 } else { 2418 lck = (kmp_user_lock_p)user_lock; 2419 } 2420 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2421 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2422 } 2423 #endif 2424 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2425 #else 2426 kmp_user_lock_p lck; 2427 2428 if ((__kmp_user_lock_kind == lk_tas) && 2429 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2430 lck = (kmp_user_lock_p)user_lock; 2431 } 2432 #if KMP_USE_FUTEX 2433 else if ((__kmp_user_lock_kind == lk_futex) && 2434 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2435 lck = (kmp_user_lock_p)user_lock; 2436 } 2437 #endif 2438 else { 2439 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); 2440 } 2441 2442 #if OMPT_SUPPORT && OMPT_OPTIONAL 2443 // This is the case, if called from omp_init_lock_with_hint: 2444 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2445 if (!codeptr) 2446 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2447 if (ompt_enabled.ompt_callback_lock_destroy) { 2448 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2449 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2450 } 2451 #endif 2452 2453 #if USE_ITT_BUILD 2454 __kmp_itt_lock_destroyed(lck); 2455 #endif /* USE_ITT_BUILD */ 2456 DESTROY_LOCK(lck); 2457 2458 if ((__kmp_user_lock_kind == lk_tas) && 2459 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2460 ; 2461 } 2462 #if KMP_USE_FUTEX 2463 else if ((__kmp_user_lock_kind == lk_futex) && 2464 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2465 ; 2466 } 2467 #endif 2468 else { 2469 __kmp_user_lock_free(user_lock, gtid, lck); 2470 } 2471 #endif // KMP_USE_DYNAMIC_LOCK 2472 } // __kmpc_destroy_lock 2473 2474 /* destroy the lock */ 2475 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2476 #if KMP_USE_DYNAMIC_LOCK 2477 2478 #if USE_ITT_BUILD 2479 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2480 __kmp_itt_lock_destroyed(ilk->lock); 2481 #endif 2482 #if OMPT_SUPPORT && OMPT_OPTIONAL 2483 // This is the case, if called from omp_init_lock_with_hint: 2484 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2485 if (!codeptr) 2486 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2487 if 
(ompt_enabled.ompt_callback_lock_destroy) { 2488 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2489 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2490 } 2491 #endif 2492 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2493 2494 #else // KMP_USE_DYNAMIC_LOCK 2495 2496 kmp_user_lock_p lck; 2497 2498 if ((__kmp_user_lock_kind == lk_tas) && 2499 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2500 OMP_NEST_LOCK_T_SIZE)) { 2501 lck = (kmp_user_lock_p)user_lock; 2502 } 2503 #if KMP_USE_FUTEX 2504 else if ((__kmp_user_lock_kind == lk_futex) && 2505 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2506 OMP_NEST_LOCK_T_SIZE)) { 2507 lck = (kmp_user_lock_p)user_lock; 2508 } 2509 #endif 2510 else { 2511 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); 2512 } 2513 2514 #if OMPT_SUPPORT && OMPT_OPTIONAL 2515 // This is the case, if called from omp_init_lock_with_hint: 2516 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2517 if (!codeptr) 2518 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2519 if (ompt_enabled.ompt_callback_lock_destroy) { 2520 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2521 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2522 } 2523 #endif 2524 2525 #if USE_ITT_BUILD 2526 __kmp_itt_lock_destroyed(lck); 2527 #endif /* USE_ITT_BUILD */ 2528 2529 DESTROY_NESTED_LOCK(lck); 2530 2531 if ((__kmp_user_lock_kind == lk_tas) && 2532 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2533 OMP_NEST_LOCK_T_SIZE)) { 2534 ; 2535 } 2536 #if KMP_USE_FUTEX 2537 else if ((__kmp_user_lock_kind == lk_futex) && 2538 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2539 OMP_NEST_LOCK_T_SIZE)) { 2540 ; 2541 } 2542 #endif 2543 else { 2544 __kmp_user_lock_free(user_lock, gtid, lck); 2545 } 2546 #endif // KMP_USE_DYNAMIC_LOCK 2547 } // __kmpc_destroy_nest_lock 2548 2549 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2550 KMP_COUNT_BLOCK(OMP_set_lock); 2551 #if KMP_USE_DYNAMIC_LOCK 2552 int tag = KMP_EXTRACT_D_TAG(user_lock); 2553 #if USE_ITT_BUILD 2554 __kmp_itt_lock_acquiring( 2555 (kmp_user_lock_p) 2556 user_lock); // itt function will get to the right lock object. 
2557 #endif 2558 #if OMPT_SUPPORT && OMPT_OPTIONAL 2559 // This is the case, if called from omp_init_lock_with_hint: 2560 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2561 if (!codeptr) 2562 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2563 if (ompt_enabled.ompt_callback_mutex_acquire) { 2564 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2565 ompt_mutex_lock, omp_lock_hint_none, 2566 __ompt_get_mutex_impl_type(user_lock), 2567 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2568 } 2569 #endif 2570 #if KMP_USE_INLINED_TAS 2571 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2572 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2573 } else 2574 #elif KMP_USE_INLINED_FUTEX 2575 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2576 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2577 } else 2578 #endif 2579 { 2580 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2581 } 2582 #if USE_ITT_BUILD 2583 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2584 #endif 2585 #if OMPT_SUPPORT && OMPT_OPTIONAL 2586 if (ompt_enabled.ompt_callback_mutex_acquired) { 2587 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2588 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2589 } 2590 #endif 2591 2592 #else // KMP_USE_DYNAMIC_LOCK 2593 2594 kmp_user_lock_p lck; 2595 2596 if ((__kmp_user_lock_kind == lk_tas) && 2597 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2598 lck = (kmp_user_lock_p)user_lock; 2599 } 2600 #if KMP_USE_FUTEX 2601 else if ((__kmp_user_lock_kind == lk_futex) && 2602 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2603 lck = (kmp_user_lock_p)user_lock; 2604 } 2605 #endif 2606 else { 2607 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); 2608 } 2609 2610 #if USE_ITT_BUILD 2611 __kmp_itt_lock_acquiring(lck); 2612 #endif /* USE_ITT_BUILD */ 2613 #if OMPT_SUPPORT && OMPT_OPTIONAL 2614 // This is the case, if called from omp_init_lock_with_hint: 2615 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2616 if (!codeptr) 2617 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2618 if (ompt_enabled.ompt_callback_mutex_acquire) { 2619 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2620 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2621 (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2622 } 2623 #endif 2624 2625 ACQUIRE_LOCK(lck, gtid); 2626 2627 #if USE_ITT_BUILD 2628 __kmp_itt_lock_acquired(lck); 2629 #endif /* USE_ITT_BUILD */ 2630 2631 #if OMPT_SUPPORT && OMPT_OPTIONAL 2632 if (ompt_enabled.ompt_callback_mutex_acquired) { 2633 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2634 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2635 } 2636 #endif 2637 2638 #endif // KMP_USE_DYNAMIC_LOCK 2639 } 2640 2641 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2642 #if KMP_USE_DYNAMIC_LOCK 2643 2644 #if USE_ITT_BUILD 2645 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2646 #endif 2647 #if OMPT_SUPPORT && OMPT_OPTIONAL 2648 // This is the case, if called from omp_init_lock_with_hint: 2649 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2650 if (!codeptr) 2651 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2652 if (ompt_enabled.enabled) { 2653 if (ompt_enabled.ompt_callback_mutex_acquire) { 2654 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2655 ompt_mutex_nest_lock, omp_lock_hint_none, 2656 __ompt_get_mutex_impl_type(user_lock), 2657 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2658 } 2659 } 2660 #endif 2661 int acquire_status = 2662 KMP_D_LOCK_FUNC(user_lock, 
set)((kmp_dyna_lock_t *)user_lock, gtid); 2663 (void) acquire_status; 2664 #if USE_ITT_BUILD 2665 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2666 #endif 2667 2668 #if OMPT_SUPPORT && OMPT_OPTIONAL 2669 if (ompt_enabled.enabled) { 2670 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2671 if (ompt_enabled.ompt_callback_mutex_acquired) { 2672 // lock_first 2673 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2674 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, 2675 codeptr); 2676 } 2677 } else { 2678 if (ompt_enabled.ompt_callback_nest_lock) { 2679 // lock_next 2680 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2681 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2682 } 2683 } 2684 } 2685 #endif 2686 2687 #else // KMP_USE_DYNAMIC_LOCK 2688 int acquire_status; 2689 kmp_user_lock_p lck; 2690 2691 if ((__kmp_user_lock_kind == lk_tas) && 2692 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2693 OMP_NEST_LOCK_T_SIZE)) { 2694 lck = (kmp_user_lock_p)user_lock; 2695 } 2696 #if KMP_USE_FUTEX 2697 else if ((__kmp_user_lock_kind == lk_futex) && 2698 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2699 OMP_NEST_LOCK_T_SIZE)) { 2700 lck = (kmp_user_lock_p)user_lock; 2701 } 2702 #endif 2703 else { 2704 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); 2705 } 2706 2707 #if USE_ITT_BUILD 2708 __kmp_itt_lock_acquiring(lck); 2709 #endif /* USE_ITT_BUILD */ 2710 #if OMPT_SUPPORT && OMPT_OPTIONAL 2711 // This is the case, if called from omp_init_lock_with_hint: 2712 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2713 if (!codeptr) 2714 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2715 if (ompt_enabled.enabled) { 2716 if (ompt_enabled.ompt_callback_mutex_acquire) { 2717 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2718 ompt_mutex_nest_lock, omp_lock_hint_none, 2719 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, 2720 codeptr); 2721 } 2722 } 2723 #endif 2724 2725 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); 2726 2727 #if USE_ITT_BUILD 2728 __kmp_itt_lock_acquired(lck); 2729 #endif /* USE_ITT_BUILD */ 2730 2731 #if OMPT_SUPPORT && OMPT_OPTIONAL 2732 if (ompt_enabled.enabled) { 2733 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2734 if (ompt_enabled.ompt_callback_mutex_acquired) { 2735 // lock_first 2736 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2737 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2738 } 2739 } else { 2740 if (ompt_enabled.ompt_callback_nest_lock) { 2741 // lock_next 2742 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2743 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2744 } 2745 } 2746 } 2747 #endif 2748 2749 #endif // KMP_USE_DYNAMIC_LOCK 2750 } 2751 2752 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2753 #if KMP_USE_DYNAMIC_LOCK 2754 2755 int tag = KMP_EXTRACT_D_TAG(user_lock); 2756 #if USE_ITT_BUILD 2757 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2758 #endif 2759 #if KMP_USE_INLINED_TAS 2760 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2761 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2762 } else 2763 #elif KMP_USE_INLINED_FUTEX 2764 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2765 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2766 } else 2767 #endif 2768 { 2769 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2770 } 2771 2772 #if OMPT_SUPPORT && OMPT_OPTIONAL 2773 // This is the case, if called from omp_init_lock_with_hint: 2774 void 
*codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2775 if (!codeptr) 2776 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2777 if (ompt_enabled.ompt_callback_mutex_released) { 2778 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2779 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2780 } 2781 #endif 2782 2783 #else // KMP_USE_DYNAMIC_LOCK 2784 2785 kmp_user_lock_p lck; 2786 2787 /* Can't use serial interval since not block structured */ 2788 /* release the lock */ 2789 2790 if ((__kmp_user_lock_kind == lk_tas) && 2791 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2792 #if KMP_OS_LINUX && \ 2793 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2794 // "fast" path implemented to fix customer performance issue 2795 #if USE_ITT_BUILD 2796 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2797 #endif /* USE_ITT_BUILD */ 2798 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2799 KMP_MB(); 2800 2801 #if OMPT_SUPPORT && OMPT_OPTIONAL 2802 // This is the case, if called from omp_init_lock_with_hint: 2803 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2804 if (!codeptr) 2805 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2806 if (ompt_enabled.ompt_callback_mutex_released) { 2807 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2808 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2809 } 2810 #endif 2811 2812 return; 2813 #else 2814 lck = (kmp_user_lock_p)user_lock; 2815 #endif 2816 } 2817 #if KMP_USE_FUTEX 2818 else if ((__kmp_user_lock_kind == lk_futex) && 2819 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2820 lck = (kmp_user_lock_p)user_lock; 2821 } 2822 #endif 2823 else { 2824 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); 2825 } 2826 2827 #if USE_ITT_BUILD 2828 __kmp_itt_lock_releasing(lck); 2829 #endif /* USE_ITT_BUILD */ 2830 2831 RELEASE_LOCK(lck, gtid); 2832 2833 #if OMPT_SUPPORT && OMPT_OPTIONAL 2834 // This is the case, if called from omp_init_lock_with_hint: 2835 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2836 if (!codeptr) 2837 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2838 if (ompt_enabled.ompt_callback_mutex_released) { 2839 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2840 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2841 } 2842 #endif 2843 2844 #endif // KMP_USE_DYNAMIC_LOCK 2845 } 2846 2847 /* release the lock */ 2848 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2849 #if KMP_USE_DYNAMIC_LOCK 2850 2851 #if USE_ITT_BUILD 2852 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2853 #endif 2854 int release_status = 2855 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2856 (void) release_status; 2857 2858 #if OMPT_SUPPORT && OMPT_OPTIONAL 2859 // This is the case, if called from omp_init_lock_with_hint: 2860 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2861 if (!codeptr) 2862 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2863 if (ompt_enabled.enabled) { 2864 if (release_status == KMP_LOCK_RELEASED) { 2865 if (ompt_enabled.ompt_callback_mutex_released) { 2866 // release_lock_last 2867 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2868 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, 2869 codeptr); 2870 } 2871 } else if (ompt_enabled.ompt_callback_nest_lock) { 2872 // release_lock_prev 2873 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2874 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2875 } 2876 } 2877 #endif 2878 2879 #else // KMP_USE_DYNAMIC_LOCK 2880 2881 kmp_user_lock_p lck; 
2882 2883 /* Can't use serial interval since not block structured */ 2885 if ((__kmp_user_lock_kind == lk_tas) && 2886 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2887 OMP_NEST_LOCK_T_SIZE)) { 2888 #if KMP_OS_LINUX && \ 2889 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2890 // "fast" path implemented to fix customer performance issue 2891 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; 2892 #if USE_ITT_BUILD 2893 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2894 #endif /* USE_ITT_BUILD */ 2895 2896 #if OMPT_SUPPORT && OMPT_OPTIONAL 2897 int release_status = KMP_LOCK_STILL_HELD; 2898 #endif 2899 2900 if (--(tl->lk.depth_locked) == 0) { 2901 TCW_4(tl->lk.poll, 0); 2902 #if OMPT_SUPPORT && OMPT_OPTIONAL 2903 release_status = KMP_LOCK_RELEASED; 2904 #endif 2905 } 2906 KMP_MB(); 2907 2908 #if OMPT_SUPPORT && OMPT_OPTIONAL 2909 // This is the case, if called from omp_init_lock_with_hint: 2910 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2911 if (!codeptr) 2912 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2913 if (ompt_enabled.enabled) { 2914 if (release_status == KMP_LOCK_RELEASED) { 2915 if (ompt_enabled.ompt_callback_mutex_released) { 2916 // release_lock_last 2917 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2918 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2919 } 2920 } else if (ompt_enabled.ompt_callback_nest_lock) { 2921 // release_lock_previous 2922 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2923 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2924 } 2925 } 2926 #endif 2927 2928 return; 2929 #else 2930 lck = (kmp_user_lock_p)user_lock; 2931 #endif 2932 } 2933 #if KMP_USE_FUTEX 2934 else if ((__kmp_user_lock_kind == lk_futex) && 2935 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2936 OMP_NEST_LOCK_T_SIZE)) { 2937 lck = (kmp_user_lock_p)user_lock; 2938 } 2939 #endif 2940 else { 2941 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); 2942 } 2943 2944 #if USE_ITT_BUILD 2945 __kmp_itt_lock_releasing(lck); 2946 #endif /* USE_ITT_BUILD */ 2947 2948 int release_status; 2949 release_status = RELEASE_NESTED_LOCK(lck, gtid); 2950 #if OMPT_SUPPORT && OMPT_OPTIONAL 2951 // This is the case, if called from omp_init_lock_with_hint: 2952 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2953 if (!codeptr) 2954 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2955 if (ompt_enabled.enabled) { 2956 if (release_status == KMP_LOCK_RELEASED) { 2957 if (ompt_enabled.ompt_callback_mutex_released) { 2958 // release_lock_last 2959 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2960 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2961 } 2962 } else if (ompt_enabled.ompt_callback_nest_lock) { 2963 // release_lock_previous 2964 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2965 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2966 } 2967 } 2968 #endif 2969 2970 #endif // KMP_USE_DYNAMIC_LOCK 2971 } 2972 2973 /* try to acquire the lock */ 2974 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2975 KMP_COUNT_BLOCK(OMP_test_lock); 2976 2977 #if KMP_USE_DYNAMIC_LOCK 2978 int rc; 2979 int tag = KMP_EXTRACT_D_TAG(user_lock); 2980 #if USE_ITT_BUILD 2981 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2982 #endif 2983 #if OMPT_SUPPORT && OMPT_OPTIONAL 2984 // This is the case, if called from omp_init_lock_with_hint: 2985 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2986 if (!codeptr) 2987 codeptr =
OMPT_GET_RETURN_ADDRESS(0); 2988 if (ompt_enabled.ompt_callback_mutex_acquire) { 2989 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2990 ompt_mutex_lock, omp_lock_hint_none, 2991 __ompt_get_mutex_impl_type(user_lock), 2992 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2993 } 2994 #endif 2995 #if KMP_USE_INLINED_TAS 2996 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2997 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2998 } else 2999 #elif KMP_USE_INLINED_FUTEX 3000 if (tag == locktag_futex && !__kmp_env_consistency_check) { 3001 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 3002 } else 3003 #endif 3004 { 3005 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 3006 } 3007 if (rc) { 3008 #if USE_ITT_BUILD 3009 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 3010 #endif 3011 #if OMPT_SUPPORT && OMPT_OPTIONAL 3012 if (ompt_enabled.ompt_callback_mutex_acquired) { 3013 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3014 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 3015 } 3016 #endif 3017 return FTN_TRUE; 3018 } else { 3019 #if USE_ITT_BUILD 3020 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 3021 #endif 3022 return FTN_FALSE; 3023 } 3024 3025 #else // KMP_USE_DYNAMIC_LOCK 3026 3027 kmp_user_lock_p lck; 3028 int rc; 3029 3030 if ((__kmp_user_lock_kind == lk_tas) && 3031 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 3032 lck = (kmp_user_lock_p)user_lock; 3033 } 3034 #if KMP_USE_FUTEX 3035 else if ((__kmp_user_lock_kind == lk_futex) && 3036 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 3037 lck = (kmp_user_lock_p)user_lock; 3038 } 3039 #endif 3040 else { 3041 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); 3042 } 3043 3044 #if USE_ITT_BUILD 3045 __kmp_itt_lock_acquiring(lck); 3046 #endif /* USE_ITT_BUILD */ 3047 #if OMPT_SUPPORT && OMPT_OPTIONAL 3048 // This is the case, if called from omp_init_lock_with_hint: 3049 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3050 if (!codeptr) 3051 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3052 if (ompt_enabled.ompt_callback_mutex_acquire) { 3053 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3054 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 3055 (ompt_wait_id_t)(uintptr_t)lck, codeptr); 3056 } 3057 #endif 3058 3059 rc = TEST_LOCK(lck, gtid); 3060 #if USE_ITT_BUILD 3061 if (rc) { 3062 __kmp_itt_lock_acquired(lck); 3063 } else { 3064 __kmp_itt_lock_cancelled(lck); 3065 } 3066 #endif /* USE_ITT_BUILD */ 3067 #if OMPT_SUPPORT && OMPT_OPTIONAL 3068 if (rc && ompt_enabled.ompt_callback_mutex_acquired) { 3069 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3070 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 3071 } 3072 #endif 3073 3074 return (rc ? 
FTN_TRUE : FTN_FALSE); 3075 3076 /* Can't use serial interval since not block structured */ 3077 3078 #endif // KMP_USE_DYNAMIC_LOCK 3079 } 3080 3081 /* try to acquire the lock */ 3082 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 3083 #if KMP_USE_DYNAMIC_LOCK 3084 int rc; 3085 #if USE_ITT_BUILD 3086 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 3087 #endif 3088 #if OMPT_SUPPORT && OMPT_OPTIONAL 3089 // This is the case, if called from omp_init_lock_with_hint: 3090 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3091 if (!codeptr) 3092 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3093 if (ompt_enabled.ompt_callback_mutex_acquire) { 3094 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3095 ompt_mutex_nest_lock, omp_lock_hint_none, 3096 __ompt_get_mutex_impl_type(user_lock), 3097 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 3098 } 3099 #endif 3100 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); 3101 #if USE_ITT_BUILD 3102 if (rc) { 3103 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 3104 } else { 3105 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 3106 } 3107 #endif 3108 #if OMPT_SUPPORT && OMPT_OPTIONAL 3109 if (ompt_enabled.enabled && rc) { 3110 if (rc == 1) { 3111 if (ompt_enabled.ompt_callback_mutex_acquired) { 3112 // lock_first 3113 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3114 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, 3115 codeptr); 3116 } 3117 } else { 3118 if (ompt_enabled.ompt_callback_nest_lock) { 3119 // lock_next 3120 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3121 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 3122 } 3123 } 3124 } 3125 #endif 3126 return rc; 3127 3128 #else // KMP_USE_DYNAMIC_LOCK 3129 3130 kmp_user_lock_p lck; 3131 int rc; 3132 3133 if ((__kmp_user_lock_kind == lk_tas) && 3134 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 3135 OMP_NEST_LOCK_T_SIZE)) { 3136 lck = (kmp_user_lock_p)user_lock; 3137 } 3138 #if KMP_USE_FUTEX 3139 else if ((__kmp_user_lock_kind == lk_futex) && 3140 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 3141 OMP_NEST_LOCK_T_SIZE)) { 3142 lck = (kmp_user_lock_p)user_lock; 3143 } 3144 #endif 3145 else { 3146 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock"); 3147 } 3148 3149 #if USE_ITT_BUILD 3150 __kmp_itt_lock_acquiring(lck); 3151 #endif /* USE_ITT_BUILD */ 3152 3153 #if OMPT_SUPPORT && OMPT_OPTIONAL 3154 // This is the case, if called from omp_init_lock_with_hint: 3155 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3156 if (!codeptr) 3157 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3158 if (ompt_enabled.enabled && 3159 ompt_enabled.ompt_callback_mutex_acquire) { 3160 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3161 ompt_mutex_nest_lock, omp_lock_hint_none, 3162 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, 3163 codeptr); 3164 } 3165 #endif 3166 3167 rc = TEST_NESTED_LOCK(lck, gtid); 3168 #if USE_ITT_BUILD 3169 if (rc) { 3170 __kmp_itt_lock_acquired(lck); 3171 } else { 3172 __kmp_itt_lock_cancelled(lck); 3173 } 3174 #endif /* USE_ITT_BUILD */ 3175 #if OMPT_SUPPORT && OMPT_OPTIONAL 3176 if (ompt_enabled.enabled && rc) { 3177 if (rc == 1) { 3178 if (ompt_enabled.ompt_callback_mutex_acquired) { 3179 // lock_first 3180 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3181 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 3182 } 3183 } else { 3184 if (ompt_enabled.ompt_callback_nest_lock) { 3185
// lock_next 3186 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3187 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 3188 } 3189 } 3190 } 3191 #endif 3192 return rc; 3193 3194 /* Can't use serial interval since not block structured */ 3195 3196 #endif // KMP_USE_DYNAMIC_LOCK 3197 } 3198 3199 // Interface to fast scalable reduce methods routines 3200 3201 // keep the selected method in a thread local structure for cross-function 3202 // usage: will be used in __kmpc_end_reduce* functions; 3203 // another solution: to re-determine the method one more time in 3204 // __kmpc_end_reduce* functions (new prototype required then) 3205 // AT: which solution is better? 3206 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \ 3207 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod)) 3208 3209 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 3210 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) 3211 3212 // description of the packed_reduction_method variable: look at the macros in 3213 // kmp.h 3214 3215 // used in a critical section reduce block 3216 static __forceinline void 3217 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3218 kmp_critical_name *crit) { 3219 3220 // this lock was visible to a customer and to the threading profile tool as a 3221 // serial overhead span (although it's used for an internal purpose only) 3222 // why was it visible in previous implementation? 3223 // should we keep it visible in new reduce block? 3224 kmp_user_lock_p lck; 3225 3226 #if KMP_USE_DYNAMIC_LOCK 3227 3228 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 3229 // Check if it is initialized. 3230 if (*lk == 0) { 3231 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3232 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 3233 KMP_GET_D_TAG(__kmp_user_lock_seq)); 3234 } else { 3235 __kmp_init_indirect_csptr(crit, loc, global_tid, 3236 KMP_GET_I_TAG(__kmp_user_lock_seq)); 3237 } 3238 } 3239 // Branch for accessing the actual lock object and set operation. This 3240 // branching is inevitable since this lock initialization does not follow the 3241 // normal dispatch path (lock table is not used). 3242 if (KMP_EXTRACT_D_TAG(lk) != 0) { 3243 lck = (kmp_user_lock_p)lk; 3244 KMP_DEBUG_ASSERT(lck != NULL); 3245 if (__kmp_env_consistency_check) { 3246 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3247 } 3248 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 3249 } else { 3250 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 3251 lck = ilk->lock; 3252 KMP_DEBUG_ASSERT(lck != NULL); 3253 if (__kmp_env_consistency_check) { 3254 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3255 } 3256 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 3257 } 3258 3259 #else // KMP_USE_DYNAMIC_LOCK 3260 3261 // We know that the fast reduction code is only emitted by Intel compilers 3262 // with 32 byte critical sections. If there isn't enough space, then we 3263 // have to use a pointer.
3264 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) { 3265 lck = (kmp_user_lock_p)crit; 3266 } else { 3267 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 3268 } 3269 KMP_DEBUG_ASSERT(lck != NULL); 3270 3271 if (__kmp_env_consistency_check) 3272 __kmp_push_sync(global_tid, ct_critical, loc, lck); 3273 3274 __kmp_acquire_user_lock_with_checks(lck, global_tid); 3275 3276 #endif // KMP_USE_DYNAMIC_LOCK 3277 } 3278 3279 // used in a critical section reduce block 3280 static __forceinline void 3281 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3282 kmp_critical_name *crit) { 3283 3284 kmp_user_lock_p lck; 3285 3286 #if KMP_USE_DYNAMIC_LOCK 3287 3288 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3289 lck = (kmp_user_lock_p)crit; 3290 if (__kmp_env_consistency_check) 3291 __kmp_pop_sync(global_tid, ct_critical, loc); 3292 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 3293 } else { 3294 kmp_indirect_lock_t *ilk = 3295 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 3296 if (__kmp_env_consistency_check) 3297 __kmp_pop_sync(global_tid, ct_critical, loc); 3298 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 3299 } 3300 3301 #else // KMP_USE_DYNAMIC_LOCK 3302 3303 // We know that the fast reduction code is only emitted by Intel compilers 3304 // with 32 byte critical sections. If there isn't enough space, then we have 3305 // to use a pointer. 3306 if (__kmp_base_user_lock_size > 32) { 3307 lck = *((kmp_user_lock_p *)crit); 3308 KMP_ASSERT(lck != NULL); 3309 } else { 3310 lck = (kmp_user_lock_p)crit; 3311 } 3312 3313 if (__kmp_env_consistency_check) 3314 __kmp_pop_sync(global_tid, ct_critical, loc); 3315 3316 __kmp_release_user_lock_with_checks(lck, global_tid); 3317 3318 #endif // KMP_USE_DYNAMIC_LOCK 3319 } // __kmp_end_critical_section_reduce_block 3320 3321 static __forceinline int 3322 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p, 3323 int *task_state) { 3324 kmp_team_t *team; 3325 3326 // Check if we are inside the teams construct? 3327 if (th->th.th_teams_microtask) { 3328 *team_p = team = th->th.th_team; 3329 if (team->t.t_level == th->th.th_teams_level) { 3330 // This is reduction at teams construct. 3331 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 3332 // Let's swap teams temporarily for the reduction. 3333 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 3334 th->th.th_team = team->t.t_parent; 3335 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 3336 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 3337 *task_state = th->th.th_task_state; 3338 th->th.th_task_state = 0; 3339 3340 return 1; 3341 } 3342 } 3343 return 0; 3344 } 3345 3346 static __forceinline void 3347 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { 3348 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction. 3349 th->th.th_info.ds.ds_tid = 0; 3350 th->th.th_team = team; 3351 th->th.th_team_nproc = team->t.t_nproc; 3352 th->th.th_task_team = team->t.t_task_team[task_state]; 3353 th->th.th_task_state = task_state; 3354 } 3355 3356 /* 2.a.i. Reduce Block without a terminating barrier */ 3357 /*! 
3358 @ingroup SYNCHRONIZATION 3359 @param loc source location information 3360 @param global_tid global thread number 3361 @param num_vars number of items (variables) to be reduced 3362 @param reduce_size size of data in bytes to be reduced 3363 @param reduce_data pointer to data to be reduced 3364 @param reduce_func callback function providing reduction operation on two 3365 operands and returning result of reduction in lhs_data 3366 @param lck pointer to the unique lock data structure 3367 @result 1 for the master thread, 0 for all other team threads, 2 for all team 3368 threads if atomic reduction needed 3369 3370 The nowait version is used for a reduce clause with the nowait argument. 3371 */ 3372 kmp_int32 3373 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3374 size_t reduce_size, void *reduce_data, 3375 void (*reduce_func)(void *lhs_data, void *rhs_data), 3376 kmp_critical_name *lck) { 3377 3378 KMP_COUNT_BLOCK(REDUCE_nowait); 3379 int retval = 0; 3380 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3381 kmp_info_t *th; 3382 kmp_team_t *team; 3383 int teams_swapped = 0, task_state; 3384 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); 3385 3386 // why do we need this initialization here at all? 3387 // Reduction clause can not be used as a stand-alone directive. 3388 3389 // do not call __kmp_serial_initialize(), it will be called by 3390 // __kmp_parallel_initialize() if needed 3391 // possible detection of false-positive race by the threadchecker ??? 3392 if (!TCR_4(__kmp_init_parallel)) 3393 __kmp_parallel_initialize(); 3394 3395 __kmp_resume_if_soft_paused(); 3396 3397 // check correctness of reduce block nesting 3398 #if KMP_USE_DYNAMIC_LOCK 3399 if (__kmp_env_consistency_check) 3400 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3401 #else 3402 if (__kmp_env_consistency_check) 3403 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3404 #endif 3405 3406 th = __kmp_thread_from_gtid(global_tid); 3407 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3408 3409 // packed_reduction_method value will be reused by __kmp_end_reduce* function, 3410 // the value should be kept in a variable 3411 // the variable should be either a construct-specific or thread-specific 3412 // property, not a team specific property 3413 // (a thread can reach the next reduce block on the next construct, reduce 3414 // method may differ on the next construct) 3415 // an ident_t "loc" parameter could be used as a construct-specific property 3416 // (what if loc == 0?) 
3417 // (if both construct-specific and team-specific variables were shared, 3418 // then unnecessary extra syncs would be needed) 3419 // a thread-specific variable is better regarding two issues above (next 3420 // construct and extra syncs) 3421 // a thread-specific "th_local.reduction_method" variable is used currently 3422 // each thread executes 'determine' and 'set' lines (no need to execute by one 3423 // thread, to avoid unnecessary extra syncs) 3424 3425 packed_reduction_method = __kmp_determine_reduction_method( 3426 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 3427 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 3428 3429 OMPT_REDUCTION_DECL(th, global_tid); 3430 if (packed_reduction_method == critical_reduce_block) { 3431 3432 OMPT_REDUCTION_BEGIN; 3433 3434 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 3435 retval = 1; 3436 3437 } else if (packed_reduction_method == empty_reduce_block) { 3438 3439 OMPT_REDUCTION_BEGIN; 3440 3441 // usage: if team size == 1, no synchronization is required ( Intel 3442 // platforms only ) 3443 retval = 1; 3444 3445 } else if (packed_reduction_method == atomic_reduce_block) { 3446 3447 retval = 2; 3448 3449 // all threads should do this pop here (because __kmpc_end_reduce_nowait() 3450 // won't be called by the code gen) 3451 // (it's not quite good, because the checking block has been closed by 3452 // this 'pop', 3453 // but atomic operation has not been executed yet, will be executed 3454 // slightly later, literally on next instruction) 3455 if (__kmp_env_consistency_check) 3456 __kmp_pop_sync(global_tid, ct_reduce, loc); 3457 3458 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3459 tree_reduce_block)) { 3460 3461 // AT: performance issue: a real barrier here 3462 // AT: (if master goes slow, other threads are blocked here waiting for the 3463 // master to come and release them) 3464 // AT: (it's not what a customer might expect specifying NOWAIT clause) 3465 // AT: (specifying NOWAIT won't result in improvement of performance, it'll 3466 // be confusing to a customer) 3467 // AT: another implementation of *barrier_gather*nowait() (or some other design) 3468 // might go faster and be more in line with the sense of NOWAIT 3469 // AT: TO DO: do epcc test and compare times 3470 3471 // this barrier should be invisible to a customer and to the threading profile 3472 // tool (it's neither a terminating barrier nor customer's code, it's 3473 // used for an internal purpose) 3474 #if OMPT_SUPPORT 3475 // JP: can this barrier potentially lead to task scheduling? 3476 // JP: as long as there is a barrier in the implementation, OMPT should and 3477 // will provide the barrier events 3478 // so we set-up the necessary frame/return addresses. 3479 ompt_frame_t *ompt_frame; 3480 if (ompt_enabled.enabled) { 3481 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3482 if (ompt_frame->enter_frame.ptr == NULL) 3483 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 3484 OMPT_STORE_RETURN_ADDRESS(global_tid); 3485 } 3486 #endif 3487 #if USE_ITT_NOTIFY 3488 __kmp_threads[global_tid]->th.th_ident = loc; 3489 #endif 3490 retval = 3491 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3492 global_tid, FALSE, reduce_size, reduce_data, reduce_func); 3493 retval = (retval != 0) ?
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a reduce nowait.
*/
void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
                              kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
    OMPT_REDUCTION_END;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (on Intel
    // platforms only)

    OMPT_REDUCTION_END;

  } else if (packed_reduction_method == atomic_reduce_block) {

    // neither master nor other workers should get here
    // (code gen does not generate this call in case 2: atomic reduce block)
    // actually it's better to remove this else-if entirely;
    // after removal this value will be checked by the 'else' and will assert

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master gets here
    // OMPT: tree reduction is annotated in the barrier code

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

/* 2.a.ii. Reduce Block with a terminating barrier */

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction needed

A blocking reduce that includes an implicit barrier.
*/
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  OMPT_REDUCTION_DECL(th, global_tid);

  if (packed_reduction_method == critical_reduce_block) {

    OMPT_REDUCTION_BEGIN;
    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    OMPT_REDUCTION_BEGIN;
    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // case tree_reduce_block:
    // this barrier should be visible to a customer and to the threading
    // profile tool (it's a terminating barrier on constructs if NOWAIT not
    // specified)
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all workers except the master should do this pop here
    // (none of the workers other than the master will enter __kmpc_end_reduce())
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: master
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));
  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a blocking reduce.
The <tt>lck</tt> pointer must be the same as that used in the corresponding
start function.
*/
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));

  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // this barrier should be visible to a customer and to the threading profile
  // tool (it's a terminating barrier on constructs if NOWAIT not specified)
  OMPT_REDUCTION_DECL(th, global_tid);

  if (packed_reduction_method == critical_reduce_block) {
    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

    OMPT_REDUCTION_END;

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

    OMPT_REDUCTION_END;

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
// TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master executes here (master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}
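/* Illustrative sketch (not part of the runtime): the blocking variant mirrors
   the nowait sketch after __kmpc_reduce_nowait() above, except that every
   participating thread must reach the terminating barrier, so a compiler
   would typically also emit __kmpc_end_reduce() on the atomic path. Names
   (loc, gtid, priv_sum, sum_reduce_func, reduction_lock) are the same
   hypothetical entities as in the earlier sketch.

   switch (__kmpc_reduce(&loc, gtid, 1, sizeof(float), &priv_sum,
                         sum_reduce_func, &reduction_lock)) {
   case 1: // combine, then join the terminating barrier / release the others
     sum += priv_sum;
     __kmpc_end_reduce(&loc, gtid, &reduction_lock);
     break;
   case 2: // atomic combine, then still call the end-function for the barrier
     __kmpc_atomic_float4_add(&loc, gtid, &sum, priv_sum);
     __kmpc_end_reduce(&loc, gtid, &reduction_lock);
     break;
   default: // 0: tree method, non-master threads are released by the master
     break;
   }
*/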

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */

kmp_uint64 __kmpc_get_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid

kmp_uint64 __kmpc_get_parent_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;
  kmp_taskdata_t *parent_task;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  parent_task = thread->th.th_current_task->td_parent;
  return (parent_task == NULL ? 0 : parent_task->td_task_id);

} // __kmpc_get_parent_taskid
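/* Illustrative sketch (not part of the runtime): a debugging or tracing hook
   could pair these getters to report where the current task sits in the task
   tree. The helper name is hypothetical.

   static void debug_print_task_ids(void) {
     kmp_uint64 id = __kmpc_get_taskid();               // 0 outside the runtime
     kmp_uint64 parent_id = __kmpc_get_parent_taskid(); // 0 for an initial task
     printf("task %llu (parent %llu)\n", (unsigned long long)id,
            (unsigned long long)parent_id);
   }
*/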

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.
@param num_dims number of associated doacross loops.
@param dims info on loops bounds.

Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
                                       // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check if shared buffer is not occupied by another loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                 __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
                              // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
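/* Illustrative sketch (not part of the runtime): roughly how a compiler might
   lower a one-dimensional doacross loop such as

       #pragma omp for ordered(1)
       for (i = 2; i < 9; i += 2) {
         #pragma omp ordered depend(sink : i - 2)
         // ... consume result of iteration i - 2 ...
         #pragma omp ordered depend(source)
       }

   matching the inclusive bounds in the comment above (lo=2, up=8, st=2).
   loc, gtid and the per-thread chunk bounds lb/ub are assumed to come from
   the enclosing worksharing lowering.

   struct kmp_dim dim;
   dim.lo = 2;
   dim.up = 8;
   dim.st = 2;
   __kmpc_doacross_init(&loc, gtid, 1, &dim);
   for (kmp_int64 i = lb; i <= ub; i += 2) {
     kmp_int64 sink_vec = i - 2;
     __kmpc_doacross_wait(&loc, gtid, &sink_vec); // wait for iteration i - 2
     // ... loop body ...
     kmp_int64 source_vec = i;
     __kmpc_doacross_post(&loc, gtid, &source_vec); // publish iteration i
   }
   __kmpc_doacross_fini(&loc, gtid);
*/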

void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}
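/* Worked example of the indexing above (illustrative only): for a
   two-dimensional nest whose second dimension has range_length ln = 10, the
   offsets (iter0, iter1) = (3, 7) are linearized as

     iter_number = iter1 + ln * iter0 = 7 + 10 * 3 = 37

   which selects bit 37 % 32 = 5 in 32-bit word 37 / 32 = 1 of
   th_doacross_flags. __kmpc_doacross_post() below sets that bit with an
   atomic OR; __kmpc_doacross_wait() above spins with KMP_YIELD until the bit
   is set. */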

void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  KMP_MB();
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}

void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}

/* omp_alloc/omp_free only defined for C/C++, not for Fortran */
void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
  return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
}

void omp_free(void *ptr, omp_allocator_handle_t allocator) {
  __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
}

int __kmpc_get_target_offload(void) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  return __kmp_target_offload;
}

int __kmpc_pause_resource(kmp_pause_status_t level) {
  if (!__kmp_init_serial) {
    return 1; // Can't pause if runtime is not initialized
  }
  return __kmp_pause_resource(level);
}
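/* Illustrative usage of the omp_alloc/omp_free entry points defined above
   (not part of the runtime): user code pairs them with a predefined allocator
   handle from omp.h, e.g. omp_default_mem_alloc.

   double *buf = (double *)omp_alloc(1024 * sizeof(double),
                                     omp_default_mem_alloc);
   if (buf != NULL) {
     // ... use buf ...
     omp_free(buf, omp_default_mem_alloc);
   }
*/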