xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_lock.cpp (revision 397e83df75e0fcd0d3fcb95ae4d794cb7600fc89)
1 /*
2  * kmp_lock.cpp -- lock-related functions
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <stddef.h>
14 #include <atomic>
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_itt.h"
20 #include "kmp_lock.h"
21 #include "kmp_wait_release.h"
22 #include "kmp_wrapper_getpid.h"
23 
24 #if KMP_USE_FUTEX
25 #include <sys/syscall.h>
26 #include <unistd.h>
27 // We should really include <futex.h>, but that causes compatibility problems on
28 // different Linux* OS distributions that either require you to include (or
29 // break when you try to include) <pci/types.h>. Since all we need is the two
30 // macros below (which are part of the kernel ABI, so can't change), we just
31 // define the constants here and don't include <futex.h>.
32 #ifndef FUTEX_WAIT
33 #define FUTEX_WAIT 0
34 #endif
35 #ifndef FUTEX_WAKE
36 #define FUTEX_WAKE 1
37 #endif
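
// Illustrative sketch (not used by the runtime): the two operations these
// constants name, written as minimal wrappers around the raw syscall; the
// code below issues the same calls inline.  FUTEX_WAIT puts the caller to
// sleep only if *addr still equals 'expected' at the time of the call, and
// FUTEX_WAKE wakes up to 'count' threads sleeping on addr.  <sys/syscall.h>
// and <unistd.h> are already included above; the wrapper names are made up.
static inline long example_futex_wait(volatile kmp_int32 *addr,
                                      kmp_int32 expected) {
  return syscall(__NR_futex, addr, FUTEX_WAIT, expected, NULL, NULL, 0);
}
static inline long example_futex_wake(volatile kmp_int32 *addr,
                                      kmp_int32 count) {
  return syscall(__NR_futex, addr, FUTEX_WAKE, count, NULL, NULL, 0);
}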
38 #endif
39 
40 /* Implement spin locks for internal library use.             */
41 /* The algorithm implemented is Lamport's bakery lock [1974]. */
42 
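// Illustrative sketch (not used by the runtime): the ticket form of Lamport's
// bakery idea that the kmp_ticket_lock_t code further below implements.  A
// thread draws the next ticket and spins until now_serving reaches it, which
// gives FIFO ordering; the counters are free to wrap around, which is why
// __kmp_validate_locks() below checks unsigned wrap-around arithmetic.  Only
// <atomic> (already included above) is assumed; the type name is made up.
namespace {
struct example_ticket_lock {
  std::atomic<kmp_uint32> next_ticket{0};
  std::atomic<kmp_uint32> now_serving{0};
  void lock() {
    // Draw a ticket; relaxed is enough because the acquire happens below.
    kmp_uint32 my_ticket = next_ticket.fetch_add(1, std::memory_order_relaxed);
    // Spin until it is our turn; acquire pairs with the release in unlock().
    while (now_serving.load(std::memory_order_acquire) != my_ticket) {
    }
  }
  void unlock() {
    // Hand the lock to the next ticket holder.
    now_serving.fetch_add(1, std::memory_order_release);
  }
};
} // namespace
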
43 void __kmp_validate_locks(void) {
44   int i;
45   kmp_uint32 x, y;
46 
47   /* Check to make sure unsigned arithmetic wraps properly */
48   x = ~((kmp_uint32)0) - 2;
49   y = x - 2;
50 
51   for (i = 0; i < 8; ++i, ++x, ++y) {
52     kmp_uint32 z = (x - y);
53     KMP_ASSERT(z == 2);
54   }
55 
56   KMP_ASSERT(offsetof(kmp_base_queuing_lock, tail_id) % 8 == 0);
57 }
58 
59 /* ------------------------------------------------------------------------ */
60 /* test and set locks */
61 
62 // For the non-nested locks, we can only assume that the first 4 bytes were
63 // allocated, since gcc only allocates 4 bytes for omp_lock_t, and the Intel
64 // compiler only allocates a 4 byte pointer on IA-32 architecture.  On
65 // Windows* OS on Intel(R) 64, we can assume that all 8 bytes were allocated.
66 //
67 // gcc reserves >= 8 bytes for nested locks, so we can assume that the
68 // entire 8 bytes were allocated for nested locks on all 64-bit platforms.
69 
70 static kmp_int32 __kmp_get_tas_lock_owner(kmp_tas_lock_t *lck) {
71   return KMP_LOCK_STRIP(KMP_ATOMIC_LD_RLX(&lck->lk.poll)) - 1;
72 }
73 
74 static inline bool __kmp_is_tas_lock_nestable(kmp_tas_lock_t *lck) {
75   return lck->lk.depth_locked != -1;
76 }
77 
78 __forceinline static int
79 __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
80   KMP_MB();
81 
82 #ifdef USE_LOCK_PROFILE
83   kmp_uint32 curr = KMP_LOCK_STRIP(lck->lk.poll);
84   if ((curr != 0) && (curr != gtid + 1))
85     __kmp_printf("LOCK CONTENTION: %p\n", lck);
86 /* else __kmp_printf( "." );*/
87 #endif /* USE_LOCK_PROFILE */
88 
89   kmp_int32 tas_free = KMP_LOCK_FREE(tas);
90   kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);
91 
92   if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == tas_free &&
93       __kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) {
94     KMP_FSYNC_ACQUIRED(lck);
95     return KMP_LOCK_ACQUIRED_FIRST;
96   }
97 
98   kmp_uint32 spins;
99   kmp_uint64 time;
100   KMP_FSYNC_PREPARE(lck);
101   KMP_INIT_YIELD(spins);
102   KMP_INIT_BACKOFF(time);
103   kmp_backoff_t backoff = __kmp_spin_backoff_params;
104   do {
105 #if !KMP_HAVE_UMWAIT
106     __kmp_spin_backoff(&backoff);
107 #else
108     if (!__kmp_tpause_enabled)
109       __kmp_spin_backoff(&backoff);
110 #endif
111     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
112   } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
113            !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy));
114   KMP_FSYNC_ACQUIRED(lck);
115   return KMP_LOCK_ACQUIRED_FIRST;
116 }
117 
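// Illustrative sketch (not used by the runtime): the test-and-test-and-set
// shape of the acquire loop above, stripped of the profiling, yield and
// backoff machinery: read the lock word cheaply first and attempt the more
// expensive atomic compare-and-store only when it looks free.  The function
// name is made up; std::atomic and KMP_CPU_PAUSE() come from headers already
// included above.
static inline void example_ttas_acquire(std::atomic<kmp_int32> &poll,
                                        kmp_int32 free_val,
                                        kmp_int32 busy_val) {
  for (;;) {
    if (poll.load(std::memory_order_relaxed) == free_val) {
      kmp_int32 expected = free_val;
      if (poll.compare_exchange_strong(expected, busy_val,
                                       std::memory_order_acquire,
                                       std::memory_order_relaxed))
        return; // lock acquired
    }
    KMP_CPU_PAUSE(); // brief pause before re-reading the lock word
  }
}
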
118 int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
119   int retval = __kmp_acquire_tas_lock_timed_template(lck, gtid);
120   return retval;
121 }
122 
123 static int __kmp_acquire_tas_lock_with_checks(kmp_tas_lock_t *lck,
124                                               kmp_int32 gtid) {
125   char const *const func = "omp_set_lock";
126   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
127       __kmp_is_tas_lock_nestable(lck)) {
128     KMP_FATAL(LockNestableUsedAsSimple, func);
129   }
130   if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) == gtid)) {
131     KMP_FATAL(LockIsAlreadyOwned, func);
132   }
133   return __kmp_acquire_tas_lock(lck, gtid);
134 }
135 
136 int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
137   kmp_int32 tas_free = KMP_LOCK_FREE(tas);
138   kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);
139   if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == tas_free &&
140       __kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) {
141     KMP_FSYNC_ACQUIRED(lck);
142     return TRUE;
143   }
144   return FALSE;
145 }
146 
147 static int __kmp_test_tas_lock_with_checks(kmp_tas_lock_t *lck,
148                                            kmp_int32 gtid) {
149   char const *const func = "omp_test_lock";
150   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
151       __kmp_is_tas_lock_nestable(lck)) {
152     KMP_FATAL(LockNestableUsedAsSimple, func);
153   }
154   return __kmp_test_tas_lock(lck, gtid);
155 }
156 
157 int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
158   KMP_MB(); /* Flush all pending memory write invalidates.  */
159 
160   KMP_FSYNC_RELEASING(lck);
161   KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(tas));
162   KMP_MB(); /* Flush all pending memory write invalidates.  */
163 
164   KMP_YIELD_OVERSUB();
165   return KMP_LOCK_RELEASED;
166 }
167 
168 static int __kmp_release_tas_lock_with_checks(kmp_tas_lock_t *lck,
169                                               kmp_int32 gtid) {
170   char const *const func = "omp_unset_lock";
171   KMP_MB(); /* in case another processor initialized lock */
172   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
173       __kmp_is_tas_lock_nestable(lck)) {
174     KMP_FATAL(LockNestableUsedAsSimple, func);
175   }
176   if (__kmp_get_tas_lock_owner(lck) == -1) {
177     KMP_FATAL(LockUnsettingFree, func);
178   }
179   if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) >= 0) &&
180       (__kmp_get_tas_lock_owner(lck) != gtid)) {
181     KMP_FATAL(LockUnsettingSetByAnother, func);
182   }
183   return __kmp_release_tas_lock(lck, gtid);
184 }
185 
186 void __kmp_init_tas_lock(kmp_tas_lock_t *lck) {
187   lck->lk.poll = KMP_LOCK_FREE(tas);
188 }
189 
190 void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck) { lck->lk.poll = 0; }
191 
192 static void __kmp_destroy_tas_lock_with_checks(kmp_tas_lock_t *lck) {
193   char const *const func = "omp_destroy_lock";
194   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
195       __kmp_is_tas_lock_nestable(lck)) {
196     KMP_FATAL(LockNestableUsedAsSimple, func);
197   }
198   if (__kmp_get_tas_lock_owner(lck) != -1) {
199     KMP_FATAL(LockStillOwned, func);
200   }
201   __kmp_destroy_tas_lock(lck);
202 }
203 
204 // nested test and set locks
205 
206 int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
207   KMP_DEBUG_ASSERT(gtid >= 0);
208 
209   if (__kmp_get_tas_lock_owner(lck) == gtid) {
210     lck->lk.depth_locked += 1;
211     return KMP_LOCK_ACQUIRED_NEXT;
212   } else {
213     __kmp_acquire_tas_lock_timed_template(lck, gtid);
214     lck->lk.depth_locked = 1;
215     return KMP_LOCK_ACQUIRED_FIRST;
216   }
217 }
218 
219 static int __kmp_acquire_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
220                                                      kmp_int32 gtid) {
221   char const *const func = "omp_set_nest_lock";
222   if (!__kmp_is_tas_lock_nestable(lck)) {
223     KMP_FATAL(LockSimpleUsedAsNestable, func);
224   }
225   return __kmp_acquire_nested_tas_lock(lck, gtid);
226 }
227 
228 int __kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
229   int retval;
230 
231   KMP_DEBUG_ASSERT(gtid >= 0);
232 
233   if (__kmp_get_tas_lock_owner(lck) == gtid) {
234     retval = ++lck->lk.depth_locked;
235   } else if (!__kmp_test_tas_lock(lck, gtid)) {
236     retval = 0;
237   } else {
238     KMP_MB();
239     retval = lck->lk.depth_locked = 1;
240   }
241   return retval;
242 }
243 
244 static int __kmp_test_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
245                                                   kmp_int32 gtid) {
246   char const *const func = "omp_test_nest_lock";
247   if (!__kmp_is_tas_lock_nestable(lck)) {
248     KMP_FATAL(LockSimpleUsedAsNestable, func);
249   }
250   return __kmp_test_nested_tas_lock(lck, gtid);
251 }
252 
253 int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
254   KMP_DEBUG_ASSERT(gtid >= 0);
255 
256   KMP_MB();
257   if (--(lck->lk.depth_locked) == 0) {
258     __kmp_release_tas_lock(lck, gtid);
259     return KMP_LOCK_RELEASED;
260   }
261   return KMP_LOCK_STILL_HELD;
262 }
263 
264 static int __kmp_release_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
265                                                      kmp_int32 gtid) {
266   char const *const func = "omp_unset_nest_lock";
267   KMP_MB(); /* in case another processor initialized lock */
268   if (!__kmp_is_tas_lock_nestable(lck)) {
269     KMP_FATAL(LockSimpleUsedAsNestable, func);
270   }
271   if (__kmp_get_tas_lock_owner(lck) == -1) {
272     KMP_FATAL(LockUnsettingFree, func);
273   }
274   if (__kmp_get_tas_lock_owner(lck) != gtid) {
275     KMP_FATAL(LockUnsettingSetByAnother, func);
276   }
277   return __kmp_release_nested_tas_lock(lck, gtid);
278 }
279 
280 void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck) {
281   __kmp_init_tas_lock(lck);
282   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
283 }
284 
285 void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck) {
286   __kmp_destroy_tas_lock(lck);
287   lck->lk.depth_locked = 0;
288 }
289 
290 static void __kmp_destroy_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
291   char const *const func = "omp_destroy_nest_lock";
292   if (!__kmp_is_tas_lock_nestable(lck)) {
293     KMP_FATAL(LockSimpleUsedAsNestable, func);
294   }
295   if (__kmp_get_tas_lock_owner(lck) != -1) {
296     KMP_FATAL(LockStillOwned, func);
297   }
298   __kmp_destroy_nested_tas_lock(lck);
299 }
300 
301 #if KMP_USE_FUTEX
302 
303 /* ------------------------------------------------------------------------ */
304 /* futex locks */
305 
306 // futex locks are really just test and set locks, with a different method
307 // of handling contention.  They take the same amount of space as test and
308 // set locks, and are allocated the same way (i.e. use the area allocated by
309 // the compiler for non-nested locks / allocate nested locks on the heap).
310 
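// Illustrative sketch (not used by the runtime): the general shape of a futex
// lock.  A contended acquirer marks the lock word as "waiters present" and
// sleeps in the kernel; the releaser pays for a FUTEX_WAKE only when that
// mark was set.  The kmp_futex_lock_t code below uses the same idea, encoding
// the owner as (gtid + 1) << 1 with one low-order bit reserved as the waiter
// flag; this sketch uses the classic 0 = free, 1 = held, 2 = held-with-waiters
// encoding instead, and its names are made up.
static inline void example_futex_acquire(volatile kmp_int32 *word) {
  // Fast path: 0 -> 1 ("held, no waiters"), so release can skip the wake.
  if (KMP_COMPARE_AND_STORE_ACQ32(word, 0, 1))
    return;
  // Slow path: force the word to 2 ("held, waiters possible") and sleep while
  // somebody else still holds it.  Re-acquiring with 2 rather than 1 keeps the
  // "wake somebody on release" information alive; FUTEX_WAIT only blocks if
  // the word still reads 2 at the time of the call.
  while (KMP_XCHG_FIXED32(word, 2) != 0)
    syscall(__NR_futex, word, FUTEX_WAIT, 2, NULL, NULL, 0);
}
static inline void example_futex_release(volatile kmp_int32 *word) {
  // Hand the word back to "free"; only the contended state pays for a wake.
  if (KMP_XCHG_FIXED32(word, 0) == 2)
    syscall(__NR_futex, word, FUTEX_WAKE, 1, NULL, NULL, 0);
}
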
311 static kmp_int32 __kmp_get_futex_lock_owner(kmp_futex_lock_t *lck) {
312   return KMP_LOCK_STRIP((TCR_4(lck->lk.poll) >> 1)) - 1;
313 }
314 
315 static inline bool __kmp_is_futex_lock_nestable(kmp_futex_lock_t *lck) {
316   return lck->lk.depth_locked != -1;
317 }
318 
319 __forceinline static int
320 __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) {
321   kmp_int32 gtid_code = (gtid + 1) << 1;
322 
323   KMP_MB();
324 
325 #ifdef USE_LOCK_PROFILE
326   kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
327   if ((curr != 0) && (curr != gtid_code))
328     __kmp_printf("LOCK CONTENTION: %p\n", lck);
329 /* else __kmp_printf( "." );*/
330 #endif /* USE_LOCK_PROFILE */
331 
332   KMP_FSYNC_PREPARE(lck);
333   KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d entering\n",
334                   lck, lck->lk.poll, gtid));
335 
336   kmp_int32 poll_val;
337 
338   while ((poll_val = KMP_COMPARE_AND_STORE_RET32(
339               &(lck->lk.poll), KMP_LOCK_FREE(futex),
340               KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {
341 
342     kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;
343     KA_TRACE(
344         1000,
345         ("__kmp_acquire_futex_lock: lck:%p, T#%d poll_val = 0x%x cond = 0x%x\n",
346          lck, gtid, poll_val, cond));
347 
348     // NOTE: do not write the condition for this branch as
349     //
350     // if ( poll_val & 1 == 0 )
351     //
352     // == binds more tightly than &, so it parses as poll_val & (1 == 0),
353     // i.e. 0, and the block below is skipped regardless of poll_val's LSB.
354     if (!cond) {
355       // Try to set the lsb in the poll to indicate to the owner
356       // thread that they need to wake this thread up.
357       if (!KMP_COMPARE_AND_STORE_REL32(&(lck->lk.poll), poll_val,
358                                        poll_val | KMP_LOCK_BUSY(1, futex))) {
359         KA_TRACE(
360             1000,
361             ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d can't set bit 0\n",
362              lck, lck->lk.poll, gtid));
363         continue;
364       }
365       poll_val |= KMP_LOCK_BUSY(1, futex);
366 
367       KA_TRACE(1000,
368                ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d bit 0 set\n", lck,
369                 lck->lk.poll, gtid));
370     }
371 
372     KA_TRACE(
373         1000,
374         ("__kmp_acquire_futex_lock: lck:%p, T#%d before futex_wait(0x%x)\n",
375          lck, gtid, poll_val));
376 
377     long rc;
378     if ((rc = syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAIT, poll_val, NULL,
379                       NULL, 0)) != 0) {
380       KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) "
381                       "failed (rc=%ld errno=%d)\n",
382                       lck, gtid, poll_val, rc, errno));
383       continue;
384     }
385 
386     KA_TRACE(1000,
387              ("__kmp_acquire_futex_lock: lck:%p, T#%d after futex_wait(0x%x)\n",
388               lck, gtid, poll_val));
389     // This thread has now made a successful futex wait call and was placed on
390     // the OS futex queue.  We must now perform a futex wake call when releasing
391     // the lock, as we have no idea how many other threads are in the queue.
392     gtid_code |= 1;
393   }
394 
395   KMP_FSYNC_ACQUIRED(lck);
396   KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
397                   lck->lk.poll, gtid));
398   return KMP_LOCK_ACQUIRED_FIRST;
399 }
400 
401 int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
402   int retval = __kmp_acquire_futex_lock_timed_template(lck, gtid);
403   return retval;
404 }
405 
406 static int __kmp_acquire_futex_lock_with_checks(kmp_futex_lock_t *lck,
407                                                 kmp_int32 gtid) {
408   char const *const func = "omp_set_lock";
409   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
410       __kmp_is_futex_lock_nestable(lck)) {
411     KMP_FATAL(LockNestableUsedAsSimple, func);
412   }
413   if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) == gtid)) {
414     KMP_FATAL(LockIsAlreadyOwned, func);
415   }
416   return __kmp_acquire_futex_lock(lck, gtid);
417 }
418 
419 int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
420   if (KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(futex),
421                                   KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {
422     KMP_FSYNC_ACQUIRED(lck);
423     return TRUE;
424   }
425   return FALSE;
426 }
427 
428 static int __kmp_test_futex_lock_with_checks(kmp_futex_lock_t *lck,
429                                              kmp_int32 gtid) {
430   char const *const func = "omp_test_lock";
431   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
432       __kmp_is_futex_lock_nestable(lck)) {
433     KMP_FATAL(LockNestableUsedAsSimple, func);
434   }
435   return __kmp_test_futex_lock(lck, gtid);
436 }
437 
438 int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
439   KMP_MB(); /* Flush all pending memory write invalidates.  */
440 
441   KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d entering\n",
442                   lck, lck->lk.poll, gtid));
443 
444   KMP_FSYNC_RELEASING(lck);
445 
446   kmp_int32 poll_val = KMP_XCHG_FIXED32(&(lck->lk.poll), KMP_LOCK_FREE(futex));
447 
448   KA_TRACE(1000,
449            ("__kmp_release_futex_lock: lck:%p, T#%d released poll_val = 0x%x\n",
450             lck, gtid, poll_val));
451 
452   if (KMP_LOCK_STRIP(poll_val) & 1) {
453     KA_TRACE(1000,
454              ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n",
455               lck, gtid));
456     syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex),
457             NULL, NULL, 0);
458   }
459 
460   KMP_MB(); /* Flush all pending memory write invalidates.  */
461 
462   KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
463                   lck->lk.poll, gtid));
464 
465   KMP_YIELD_OVERSUB();
466   return KMP_LOCK_RELEASED;
467 }
468 
469 static int __kmp_release_futex_lock_with_checks(kmp_futex_lock_t *lck,
470                                                 kmp_int32 gtid) {
471   char const *const func = "omp_unset_lock";
472   KMP_MB(); /* in case another processor initialized lock */
473   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
474       __kmp_is_futex_lock_nestable(lck)) {
475     KMP_FATAL(LockNestableUsedAsSimple, func);
476   }
477   if (__kmp_get_futex_lock_owner(lck) == -1) {
478     KMP_FATAL(LockUnsettingFree, func);
479   }
480   if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) >= 0) &&
481       (__kmp_get_futex_lock_owner(lck) != gtid)) {
482     KMP_FATAL(LockUnsettingSetByAnother, func);
483   }
484   return __kmp_release_futex_lock(lck, gtid);
485 }
486 
487 void __kmp_init_futex_lock(kmp_futex_lock_t *lck) {
488   TCW_4(lck->lk.poll, KMP_LOCK_FREE(futex));
489 }
490 
491 void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck) { lck->lk.poll = 0; }
492 
493 static void __kmp_destroy_futex_lock_with_checks(kmp_futex_lock_t *lck) {
494   char const *const func = "omp_destroy_lock";
495   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
496       __kmp_is_futex_lock_nestable(lck)) {
497     KMP_FATAL(LockNestableUsedAsSimple, func);
498   }
499   if (__kmp_get_futex_lock_owner(lck) != -1) {
500     KMP_FATAL(LockStillOwned, func);
501   }
502   __kmp_destroy_futex_lock(lck);
503 }
504 
505 // nested futex locks
506 
507 int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
508   KMP_DEBUG_ASSERT(gtid >= 0);
509 
510   if (__kmp_get_futex_lock_owner(lck) == gtid) {
511     lck->lk.depth_locked += 1;
512     return KMP_LOCK_ACQUIRED_NEXT;
513   } else {
514     __kmp_acquire_futex_lock_timed_template(lck, gtid);
515     lck->lk.depth_locked = 1;
516     return KMP_LOCK_ACQUIRED_FIRST;
517   }
518 }
519 
520 static int __kmp_acquire_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
521                                                        kmp_int32 gtid) {
522   char const *const func = "omp_set_nest_lock";
523   if (!__kmp_is_futex_lock_nestable(lck)) {
524     KMP_FATAL(LockSimpleUsedAsNestable, func);
525   }
526   return __kmp_acquire_nested_futex_lock(lck, gtid);
527 }
528 
529 int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
530   int retval;
531 
532   KMP_DEBUG_ASSERT(gtid >= 0);
533 
534   if (__kmp_get_futex_lock_owner(lck) == gtid) {
535     retval = ++lck->lk.depth_locked;
536   } else if (!__kmp_test_futex_lock(lck, gtid)) {
537     retval = 0;
538   } else {
539     KMP_MB();
540     retval = lck->lk.depth_locked = 1;
541   }
542   return retval;
543 }
544 
545 static int __kmp_test_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
546                                                     kmp_int32 gtid) {
547   char const *const func = "omp_test_nest_lock";
548   if (!__kmp_is_futex_lock_nestable(lck)) {
549     KMP_FATAL(LockSimpleUsedAsNestable, func);
550   }
551   return __kmp_test_nested_futex_lock(lck, gtid);
552 }
553 
554 int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
555   KMP_DEBUG_ASSERT(gtid >= 0);
556 
557   KMP_MB();
558   if (--(lck->lk.depth_locked) == 0) {
559     __kmp_release_futex_lock(lck, gtid);
560     return KMP_LOCK_RELEASED;
561   }
562   return KMP_LOCK_STILL_HELD;
563 }
564 
565 static int __kmp_release_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
566                                                        kmp_int32 gtid) {
567   char const *const func = "omp_unset_nest_lock";
568   KMP_MB(); /* in case another processor initialized lock */
569   if (!__kmp_is_futex_lock_nestable(lck)) {
570     KMP_FATAL(LockSimpleUsedAsNestable, func);
571   }
572   if (__kmp_get_futex_lock_owner(lck) == -1) {
573     KMP_FATAL(LockUnsettingFree, func);
574   }
575   if (__kmp_get_futex_lock_owner(lck) != gtid) {
576     KMP_FATAL(LockUnsettingSetByAnother, func);
577   }
578   return __kmp_release_nested_futex_lock(lck, gtid);
579 }
580 
581 void __kmp_init_nested_futex_lock(kmp_futex_lock_t *lck) {
582   __kmp_init_futex_lock(lck);
583   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
584 }
585 
586 void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck) {
587   __kmp_destroy_futex_lock(lck);
588   lck->lk.depth_locked = 0;
589 }
590 
591 static void __kmp_destroy_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
592   char const *const func = "omp_destroy_nest_lock";
593   if (!__kmp_is_futex_lock_nestable(lck)) {
594     KMP_FATAL(LockSimpleUsedAsNestable, func);
595   }
596   if (__kmp_get_futex_lock_owner(lck) != -1) {
597     KMP_FATAL(LockStillOwned, func);
598   }
599   __kmp_destroy_nested_futex_lock(lck);
600 }
601 
602 #endif // KMP_USE_FUTEX
603 
604 /* ------------------------------------------------------------------------ */
605 /* ticket (bakery) locks */
606 
607 static kmp_int32 __kmp_get_ticket_lock_owner(kmp_ticket_lock_t *lck) {
608   return std::atomic_load_explicit(&lck->lk.owner_id,
609                                    std::memory_order_relaxed) -
610          1;
611 }
612 
613 static inline bool __kmp_is_ticket_lock_nestable(kmp_ticket_lock_t *lck) {
614   return std::atomic_load_explicit(&lck->lk.depth_locked,
615                                    std::memory_order_relaxed) != -1;
616 }
617 
618 static kmp_uint32 __kmp_bakery_check(void *now_serving, kmp_uint32 my_ticket) {
619   return std::atomic_load_explicit((std::atomic<unsigned> *)now_serving,
620                                    std::memory_order_acquire) == my_ticket;
621 }
622 
623 __forceinline static int
624 __kmp_acquire_ticket_lock_timed_template(kmp_ticket_lock_t *lck,
625                                          kmp_int32 gtid) {
626   kmp_uint32 my_ticket = std::atomic_fetch_add_explicit(
627       &lck->lk.next_ticket, 1U, std::memory_order_relaxed);
628 
629 #ifdef USE_LOCK_PROFILE
630   if (std::atomic_load_explicit(&lck->lk.now_serving,
631                                 std::memory_order_relaxed) != my_ticket)
632     __kmp_printf("LOCK CONTENTION: %p\n", lck);
633 /* else __kmp_printf( "." );*/
634 #endif /* USE_LOCK_PROFILE */
635 
636   if (std::atomic_load_explicit(&lck->lk.now_serving,
637                                 std::memory_order_acquire) == my_ticket) {
638     return KMP_LOCK_ACQUIRED_FIRST;
639   }
640   KMP_WAIT_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck);
641   return KMP_LOCK_ACQUIRED_FIRST;
642 }
643 
644 int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
645   int retval = __kmp_acquire_ticket_lock_timed_template(lck, gtid);
646   return retval;
647 }
648 
649 static int __kmp_acquire_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
650                                                  kmp_int32 gtid) {
651   char const *const func = "omp_set_lock";
652 
653   if (!std::atomic_load_explicit(&lck->lk.initialized,
654                                  std::memory_order_relaxed)) {
655     KMP_FATAL(LockIsUninitialized, func);
656   }
657   if (lck->lk.self != lck) {
658     KMP_FATAL(LockIsUninitialized, func);
659   }
660   if (__kmp_is_ticket_lock_nestable(lck)) {
661     KMP_FATAL(LockNestableUsedAsSimple, func);
662   }
663   if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) == gtid)) {
664     KMP_FATAL(LockIsAlreadyOwned, func);
665   }
666 
667   __kmp_acquire_ticket_lock(lck, gtid);
668 
669   std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
670                              std::memory_order_relaxed);
671   return KMP_LOCK_ACQUIRED_FIRST;
672 }
673 
674 int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
675   kmp_uint32 my_ticket = std::atomic_load_explicit(&lck->lk.next_ticket,
676                                                    std::memory_order_relaxed);
677 
678   if (std::atomic_load_explicit(&lck->lk.now_serving,
679                                 std::memory_order_relaxed) == my_ticket) {
680     kmp_uint32 next_ticket = my_ticket + 1;
681     if (std::atomic_compare_exchange_strong_explicit(
682             &lck->lk.next_ticket, &my_ticket, next_ticket,
683             std::memory_order_acquire, std::memory_order_acquire)) {
684       return TRUE;
685     }
686   }
687   return FALSE;
688 }
689 
690 static int __kmp_test_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
691                                               kmp_int32 gtid) {
692   char const *const func = "omp_test_lock";
693 
694   if (!std::atomic_load_explicit(&lck->lk.initialized,
695                                  std::memory_order_relaxed)) {
696     KMP_FATAL(LockIsUninitialized, func);
697   }
698   if (lck->lk.self != lck) {
699     KMP_FATAL(LockIsUninitialized, func);
700   }
701   if (__kmp_is_ticket_lock_nestable(lck)) {
702     KMP_FATAL(LockNestableUsedAsSimple, func);
703   }
704 
705   int retval = __kmp_test_ticket_lock(lck, gtid);
706 
707   if (retval) {
708     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
709                                std::memory_order_relaxed);
710   }
711   return retval;
712 }
713 
714 int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
715   kmp_uint32 distance = std::atomic_load_explicit(&lck->lk.next_ticket,
716                                                   std::memory_order_relaxed) -
717                         std::atomic_load_explicit(&lck->lk.now_serving,
718                                                   std::memory_order_relaxed);
719 
720   std::atomic_fetch_add_explicit(&lck->lk.now_serving, 1U,
721                                  std::memory_order_release);
722 
723   KMP_YIELD(distance >
724             (kmp_uint32)(__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
725   return KMP_LOCK_RELEASED;
726 }
727 
728 static int __kmp_release_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
729                                                  kmp_int32 gtid) {
730   char const *const func = "omp_unset_lock";
731 
732   if (!std::atomic_load_explicit(&lck->lk.initialized,
733                                  std::memory_order_relaxed)) {
734     KMP_FATAL(LockIsUninitialized, func);
735   }
736   if (lck->lk.self != lck) {
737     KMP_FATAL(LockIsUninitialized, func);
738   }
739   if (__kmp_is_ticket_lock_nestable(lck)) {
740     KMP_FATAL(LockNestableUsedAsSimple, func);
741   }
742   if (__kmp_get_ticket_lock_owner(lck) == -1) {
743     KMP_FATAL(LockUnsettingFree, func);
744   }
745   if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) >= 0) &&
746       (__kmp_get_ticket_lock_owner(lck) != gtid)) {
747     KMP_FATAL(LockUnsettingSetByAnother, func);
748   }
749   std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
750   return __kmp_release_ticket_lock(lck, gtid);
751 }
752 
753 void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck) {
754   lck->lk.location = NULL;
755   lck->lk.self = lck;
756   std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
757                              std::memory_order_relaxed);
758   std::atomic_store_explicit(&lck->lk.now_serving, 0U,
759                              std::memory_order_relaxed);
760   std::atomic_store_explicit(
761       &lck->lk.owner_id, 0,
762       std::memory_order_relaxed); // no thread owns the lock.
763   std::atomic_store_explicit(
764       &lck->lk.depth_locked, -1,
765       std::memory_order_relaxed); // -1 => not a nested lock.
766   std::atomic_store_explicit(&lck->lk.initialized, true,
767                              std::memory_order_release);
768 }
769 
770 void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck) {
771   std::atomic_store_explicit(&lck->lk.initialized, false,
772                              std::memory_order_release);
773   lck->lk.self = NULL;
774   lck->lk.location = NULL;
775   std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
776                              std::memory_order_relaxed);
777   std::atomic_store_explicit(&lck->lk.now_serving, 0U,
778                              std::memory_order_relaxed);
779   std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
780   std::atomic_store_explicit(&lck->lk.depth_locked, -1,
781                              std::memory_order_relaxed);
782 }
783 
784 static void __kmp_destroy_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
785   char const *const func = "omp_destroy_lock";
786 
787   if (!std::atomic_load_explicit(&lck->lk.initialized,
788                                  std::memory_order_relaxed)) {
789     KMP_FATAL(LockIsUninitialized, func);
790   }
791   if (lck->lk.self != lck) {
792     KMP_FATAL(LockIsUninitialized, func);
793   }
794   if (__kmp_is_ticket_lock_nestable(lck)) {
795     KMP_FATAL(LockNestableUsedAsSimple, func);
796   }
797   if (__kmp_get_ticket_lock_owner(lck) != -1) {
798     KMP_FATAL(LockStillOwned, func);
799   }
800   __kmp_destroy_ticket_lock(lck);
801 }
802 
803 // nested ticket locks
804 
805 int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
806   KMP_DEBUG_ASSERT(gtid >= 0);
807 
808   if (__kmp_get_ticket_lock_owner(lck) == gtid) {
809     std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
810                                    std::memory_order_relaxed);
811     return KMP_LOCK_ACQUIRED_NEXT;
812   } else {
813     __kmp_acquire_ticket_lock_timed_template(lck, gtid);
814     std::atomic_store_explicit(&lck->lk.depth_locked, 1,
815                                std::memory_order_relaxed);
816     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
817                                std::memory_order_relaxed);
818     return KMP_LOCK_ACQUIRED_FIRST;
819   }
820 }
821 
822 static int __kmp_acquire_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
823                                                         kmp_int32 gtid) {
824   char const *const func = "omp_set_nest_lock";
825 
826   if (!std::atomic_load_explicit(&lck->lk.initialized,
827                                  std::memory_order_relaxed)) {
828     KMP_FATAL(LockIsUninitialized, func);
829   }
830   if (lck->lk.self != lck) {
831     KMP_FATAL(LockIsUninitialized, func);
832   }
833   if (!__kmp_is_ticket_lock_nestable(lck)) {
834     KMP_FATAL(LockSimpleUsedAsNestable, func);
835   }
836   return __kmp_acquire_nested_ticket_lock(lck, gtid);
837 }
838 
839 int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
840   int retval;
841 
842   KMP_DEBUG_ASSERT(gtid >= 0);
843 
844   if (__kmp_get_ticket_lock_owner(lck) == gtid) {
845     retval = std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
846                                             std::memory_order_relaxed) +
847              1;
848   } else if (!__kmp_test_ticket_lock(lck, gtid)) {
849     retval = 0;
850   } else {
851     std::atomic_store_explicit(&lck->lk.depth_locked, 1,
852                                std::memory_order_relaxed);
853     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
854                                std::memory_order_relaxed);
855     retval = 1;
856   }
857   return retval;
858 }
859 
860 static int __kmp_test_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
861                                                      kmp_int32 gtid) {
862   char const *const func = "omp_test_nest_lock";
863 
864   if (!std::atomic_load_explicit(&lck->lk.initialized,
865                                  std::memory_order_relaxed)) {
866     KMP_FATAL(LockIsUninitialized, func);
867   }
868   if (lck->lk.self != lck) {
869     KMP_FATAL(LockIsUninitialized, func);
870   }
871   if (!__kmp_is_ticket_lock_nestable(lck)) {
872     KMP_FATAL(LockSimpleUsedAsNestable, func);
873   }
874   return __kmp_test_nested_ticket_lock(lck, gtid);
875 }
876 
877 int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
878   KMP_DEBUG_ASSERT(gtid >= 0);
879 
880   if ((std::atomic_fetch_add_explicit(&lck->lk.depth_locked, -1,
881                                       std::memory_order_relaxed) -
882        1) == 0) {
883     std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
884     __kmp_release_ticket_lock(lck, gtid);
885     return KMP_LOCK_RELEASED;
886   }
887   return KMP_LOCK_STILL_HELD;
888 }
889 
890 static int __kmp_release_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
891                                                         kmp_int32 gtid) {
892   char const *const func = "omp_unset_nest_lock";
893 
894   if (!std::atomic_load_explicit(&lck->lk.initialized,
895                                  std::memory_order_relaxed)) {
896     KMP_FATAL(LockIsUninitialized, func);
897   }
898   if (lck->lk.self != lck) {
899     KMP_FATAL(LockIsUninitialized, func);
900   }
901   if (!__kmp_is_ticket_lock_nestable(lck)) {
902     KMP_FATAL(LockSimpleUsedAsNestable, func);
903   }
904   if (__kmp_get_ticket_lock_owner(lck) == -1) {
905     KMP_FATAL(LockUnsettingFree, func);
906   }
907   if (__kmp_get_ticket_lock_owner(lck) != gtid) {
908     KMP_FATAL(LockUnsettingSetByAnother, func);
909   }
910   return __kmp_release_nested_ticket_lock(lck, gtid);
911 }
912 
913 void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck) {
914   __kmp_init_ticket_lock(lck);
915   std::atomic_store_explicit(&lck->lk.depth_locked, 0,
916                              std::memory_order_relaxed);
917   // >= 0 for nestable locks, -1 for simple locks
918 }
919 
920 void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck) {
921   __kmp_destroy_ticket_lock(lck);
922   std::atomic_store_explicit(&lck->lk.depth_locked, 0,
923                              std::memory_order_relaxed);
924 }
925 
926 static void
927 __kmp_destroy_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
928   char const *const func = "omp_destroy_nest_lock";
929 
930   if (!std::atomic_load_explicit(&lck->lk.initialized,
931                                  std::memory_order_relaxed)) {
932     KMP_FATAL(LockIsUninitialized, func);
933   }
934   if (lck->lk.self != lck) {
935     KMP_FATAL(LockIsUninitialized, func);
936   }
937   if (!__kmp_is_ticket_lock_nestable(lck)) {
938     KMP_FATAL(LockSimpleUsedAsNestable, func);
939   }
940   if (__kmp_get_ticket_lock_owner(lck) != -1) {
941     KMP_FATAL(LockStillOwned, func);
942   }
943   __kmp_destroy_nested_ticket_lock(lck);
944 }
945 
946 // access functions to fields which don't exist for all lock kinds.
947 
948 static const ident_t *__kmp_get_ticket_lock_location(kmp_ticket_lock_t *lck) {
949   return lck->lk.location;
950 }
951 
952 static void __kmp_set_ticket_lock_location(kmp_ticket_lock_t *lck,
953                                            const ident_t *loc) {
954   lck->lk.location = loc;
955 }
956 
957 static kmp_lock_flags_t __kmp_get_ticket_lock_flags(kmp_ticket_lock_t *lck) {
958   return lck->lk.flags;
959 }
960 
961 static void __kmp_set_ticket_lock_flags(kmp_ticket_lock_t *lck,
962                                         kmp_lock_flags_t flags) {
963   lck->lk.flags = flags;
964 }
965 
966 /* ------------------------------------------------------------------------ */
967 /* queuing locks */
968 
969 /* First the states
970    (head,tail) =              0, 0  means lock is unheld, nobody on queue
971                  UINT_MAX or -1, 0  means lock is held, nobody on queue
972                               h, h  means lock held or about to transition,
973                                     1 element on queue
974                               h, t  h <> t, means lock is held or about to
975                                     transition, >1 elements on queue
976 
977    Now the transitions
978       Acquire(0,0)  = -1 ,0
979       Release(0,0)  = Error
980       Acquire(-1,0) =  h ,h    h > 0
981       Release(-1,0) =  0 ,0
982       Acquire(h,h)  =  h ,t    h > 0, t > 0, h <> t
983       Release(h,h)  = -1 ,0    h > 0
984       Acquire(h,t)  =  h ,t'   h > 0, t > 0, t' > 0, h <> t, h <> t', t <> t'
985       Release(h,t)  =  h',t    h > 0, t > 0, h <> t, h <> h', h' maybe = t
986 
987    And pictorially
988 
989            +-----+
990            | 0, 0|------- release -------> Error
991            +-----+
992              |  ^
993       acquire|  |release
994              |  |
995              |  |
996              v  |
997            +-----+
998            |-1, 0|
999            +-----+
1000              |  ^
1001       acquire|  |release
1002              |  |
1003              |  |
1004              v  |
1005            +-----+
1006            | h, h|
1007            +-----+
1008              |  ^
1009       acquire|  |release
1010              |  |
1011              |  |
1012              v  |
1013            +-----+
1014            | h, t|----- acquire, release loopback ---+
1015            +-----+                                   |
1016                 ^                                    |
1017                 |                                    |
1018                 +------------------------------------+
1019  */
1020 
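// Illustrative sketch (not used by the runtime): the transition table above
// written out as a small model.  'gtid1' is the acquiring thread's gtid + 1,
// and 'next_of_head' is the value the releasing thread reads from the queue
// head's th_next_waiting link.  The real code performs these updates with
// 32-bit and 64-bit compare-and-swap operations on head_id/tail_id; the names
// here are made up, and the illegal Release(0,0) case is not modeled.
struct example_queue_state {
  kmp_int32 head; // 0: lock free; -1: held, queue empty; >0: gtid+1 of head
  kmp_int32 tail; // 0: queue empty; >0: gtid+1 of queue tail
};
static inline example_queue_state
example_queuing_acquire(example_queue_state s, kmp_int32 gtid1) {
  if (s.head == 0) // (0,0)  -> (-1,0): grabbed the free lock, nothing queued
    return {-1, 0};
  if (s.head == -1) // (-1,0) -> (h,h): first waiter enqueues itself
    return {gtid1, gtid1};
  return {s.head, gtid1}; // (h,t) -> (h,t'): append to the end of the queue
}
static inline example_queue_state
example_queuing_release(example_queue_state s, kmp_int32 next_of_head) {
  if (s.head == -1) // (-1,0) -> (0,0): nobody waiting, the lock becomes free
    return {0, 0};
  if (s.head == s.tail) // (h,h) -> (-1,0): the only queued thread gets the lock
    return {-1, 0};
  return {next_of_head, s.tail}; // (h,t) -> (h',t): head's successor is new head
}
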
1021 #ifdef DEBUG_QUEUING_LOCKS
1022 
1023 /* Stuff for circular trace buffer */
1024 #define TRACE_BUF_ELE 1024
1025 static char traces[TRACE_BUF_ELE][128] = {0};
1026 static int tc = 0;
1027 #define TRACE_LOCK(X, Y)                                                       \
1028   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s\n", X, Y);
1029 #define TRACE_LOCK_T(X, Y, Z)                                                  \
1030   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s%d\n", X, Y, Z);
1031 #define TRACE_LOCK_HT(X, Y, Z, Q)                                              \
1032   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s %d,%d\n", X, Y,   \
1033                Z, Q);
1034 
1035 static void __kmp_dump_queuing_lock(kmp_info_t *this_thr, kmp_int32 gtid,
1036                                     kmp_queuing_lock_t *lck, kmp_int32 head_id,
1037                                     kmp_int32 tail_id) {
1038   kmp_int32 t, i;
1039 
1040   __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: TRACE BEGINS HERE! \n");
1041 
1042   i = tc % TRACE_BUF_ELE;
1043   __kmp_printf_no_lock("%s\n", traces[i]);
1044   i = (i + 1) % TRACE_BUF_ELE;
1045   while (i != (tc % TRACE_BUF_ELE)) {
1046     __kmp_printf_no_lock("%s", traces[i]);
1047     i = (i + 1) % TRACE_BUF_ELE;
1048   }
1049   __kmp_printf_no_lock("\n");
1050 
1051   __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: gtid+1:%d, spin_here:%d, "
1052                        "next_wait:%d, head_id:%d, tail_id:%d\n",
1053                        gtid + 1, this_thr->th.th_spin_here,
1054                        this_thr->th.th_next_waiting, head_id, tail_id);
1055 
1056   __kmp_printf_no_lock("\t\thead: %d ", lck->lk.head_id);
1057 
1058   if (lck->lk.head_id >= 1) {
1059     t = __kmp_threads[lck->lk.head_id - 1]->th.th_next_waiting;
1060     while (t > 0) {
1061       __kmp_printf_no_lock("-> %d ", t);
1062       t = __kmp_threads[t - 1]->th.th_next_waiting;
1063     }
1064   }
1065   __kmp_printf_no_lock(";  tail: %d ", lck->lk.tail_id);
1066   __kmp_printf_no_lock("\n\n");
1067 }
1068 
1069 #endif /* DEBUG_QUEUING_LOCKS */
1070 
1071 static kmp_int32 __kmp_get_queuing_lock_owner(kmp_queuing_lock_t *lck) {
1072   return TCR_4(lck->lk.owner_id) - 1;
1073 }
1074 
1075 static inline bool __kmp_is_queuing_lock_nestable(kmp_queuing_lock_t *lck) {
1076   return lck->lk.depth_locked != -1;
1077 }
1078 
1079 /* Acquire a lock using the queuing lock implementation */
1080 template <bool takeTime>
1081 /* [TLW] The unused template above is left behind because of what BEB believes
1082    is a potential compiler problem with __forceinline. */
1083 __forceinline static int
1084 __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
1085                                           kmp_int32 gtid) {
1086   kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid);
1087   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1088   volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
1089   volatile kmp_uint32 *spin_here_p;
1090 
1091 #if OMPT_SUPPORT
1092   ompt_state_t prev_state = ompt_state_undefined;
1093 #endif
1094 
1095   KA_TRACE(1000,
1096            ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
1097 
1098   KMP_FSYNC_PREPARE(lck);
1099   KMP_DEBUG_ASSERT(this_thr != NULL);
1100   spin_here_p = &this_thr->th.th_spin_here;
1101 
1102 #ifdef DEBUG_QUEUING_LOCKS
1103   TRACE_LOCK(gtid + 1, "acq ent");
1104   if (*spin_here_p)
1105     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1106   if (this_thr->th.th_next_waiting != 0)
1107     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1108 #endif
1109   KMP_DEBUG_ASSERT(!*spin_here_p);
1110   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1111 
1112   /* The following st.rel to spin_here_p needs to precede the cmpxchg.acq to
1113      head_id_p that may follow, not just in execution order, but also in
1114      visibility order. This way, when a releasing thread observes the changes to
1115      the queue by this thread, it can rightly assume that spin_here_p has
1116      already been set to TRUE, so that when it sets spin_here_p to FALSE, it is
1117      not premature.  If the releasing thread sets spin_here_p to FALSE before
1118      this thread sets it to TRUE, this thread will hang. */
1119   *spin_here_p = TRUE; /* before enqueuing to prevent race */
1120 
1121   while (1) {
1122     kmp_int32 enqueued;
1123     kmp_int32 head;
1124     kmp_int32 tail;
1125 
1126     head = *head_id_p;
1127 
1128     switch (head) {
1129 
1130     case -1: {
1131 #ifdef DEBUG_QUEUING_LOCKS
1132       tail = *tail_id_p;
1133       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1134 #endif
1135       tail = 0; /* to make sure next link asynchronously read is not set
1136                 accidentally; this assignment prevents us from entering the
1137                 if ( t > 0 ) condition in the enqueued case below, which is not
1138                 necessary for this state transition */
1139 
1140       /* try (-1,0)->(tid,tid) */
1141       enqueued = KMP_COMPARE_AND_STORE_ACQ64((volatile kmp_int64 *)tail_id_p,
1142                                              KMP_PACK_64(-1, 0),
1143                                              KMP_PACK_64(gtid + 1, gtid + 1));
1144 #ifdef DEBUG_QUEUING_LOCKS
1145       if (enqueued)
1146         TRACE_LOCK(gtid + 1, "acq enq: (-1,0)->(tid,tid)");
1147 #endif
1148     } break;
1149 
1150     default: {
1151       tail = *tail_id_p;
1152       KMP_DEBUG_ASSERT(tail != gtid + 1);
1153 
1154 #ifdef DEBUG_QUEUING_LOCKS
1155       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1156 #endif
1157 
1158       if (tail == 0) {
1159         enqueued = FALSE;
1160       } else {
1161         /* try (h,t) or (h,h)->(h,tid) */
1162         enqueued = KMP_COMPARE_AND_STORE_ACQ32(tail_id_p, tail, gtid + 1);
1163 
1164 #ifdef DEBUG_QUEUING_LOCKS
1165         if (enqueued)
1166           TRACE_LOCK(gtid + 1, "acq enq: (h,t)->(h,tid)");
1167 #endif
1168       }
1169     } break;
1170 
1171     case 0: /* empty queue */
1172     {
1173       kmp_int32 grabbed_lock;
1174 
1175 #ifdef DEBUG_QUEUING_LOCKS
1176       tail = *tail_id_p;
1177       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1178 #endif
1179       /* try (0,0)->(-1,0) */
1180 
1181       /* only legal transition out of head = 0 is head = -1 with no change to
1182        * tail */
1183       grabbed_lock = KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1);
1184 
1185       if (grabbed_lock) {
1186 
1187         *spin_here_p = FALSE;
1188 
1189         KA_TRACE(
1190             1000,
1191             ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: no queuing\n",
1192              lck, gtid));
1193 #ifdef DEBUG_QUEUING_LOCKS
1194         TRACE_LOCK_HT(gtid + 1, "acq exit: ", head, 0);
1195 #endif
1196 
1197 #if OMPT_SUPPORT
1198         if (ompt_enabled.enabled && prev_state != ompt_state_undefined) {
1199           /* change the state before clearing wait_id */
1200           this_thr->th.ompt_thread_info.state = prev_state;
1201           this_thr->th.ompt_thread_info.wait_id = 0;
1202         }
1203 #endif
1204 
1205         KMP_FSYNC_ACQUIRED(lck);
1206         return KMP_LOCK_ACQUIRED_FIRST; /* lock holder cannot be on queue */
1207       }
1208       enqueued = FALSE;
1209     } break;
1210     }
1211 
1212 #if OMPT_SUPPORT
1213     if (ompt_enabled.enabled && prev_state == ompt_state_undefined) {
1214       /* this thread will spin; set wait_id before entering wait state */
1215       prev_state = this_thr->th.ompt_thread_info.state;
1216       this_thr->th.ompt_thread_info.wait_id = (uint64_t)lck;
1217       this_thr->th.ompt_thread_info.state = ompt_state_wait_lock;
1218     }
1219 #endif
1220 
1221     if (enqueued) {
1222       if (tail > 0) {
1223         kmp_info_t *tail_thr = __kmp_thread_from_gtid(tail - 1);
1224         KMP_ASSERT(tail_thr != NULL);
1225         tail_thr->th.th_next_waiting = gtid + 1;
1226         /* corresponding wait for this write in release code */
1227       }
1228       KA_TRACE(1000,
1229                ("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n",
1230                 lck, gtid));
1231 
1232       KMP_MB();
1233       // ToDo: Use __kmp_wait_sleep or similar when blocktime != inf
1234       KMP_WAIT(spin_here_p, FALSE, KMP_EQ, lck);
1235       // Synchronize writes to both runtime thread structures
1236       // and writes in user code.
1237       KMP_MB();
1238 
1239 #ifdef DEBUG_QUEUING_LOCKS
1240       TRACE_LOCK(gtid + 1, "acq spin");
1241 
1242       if (this_thr->th.th_next_waiting != 0)
1243         __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1244 #endif
1245       KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1246       KA_TRACE(1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: after "
1247                       "waiting on queue\n",
1248                       lck, gtid));
1249 
1250 #ifdef DEBUG_QUEUING_LOCKS
1251       TRACE_LOCK(gtid + 1, "acq exit 2");
1252 #endif
1253 
1254 #if OMPT_SUPPORT
1255       /* change the state before clearing wait_id */
1256       this_thr->th.ompt_thread_info.state = prev_state;
1257       this_thr->th.ompt_thread_info.wait_id = 0;
1258 #endif
1259 
1260       /* got lock, we were dequeued by the thread that released lock */
1261       return KMP_LOCK_ACQUIRED_FIRST;
1262     }
1263 
1264     /* Yield if number of threads > number of logical processors */
1265     /* ToDo: Not sure why this should only be in oversubscription case,
1266        maybe should be traditional YIELD_INIT/YIELD_WHEN loop */
1267     KMP_YIELD_OVERSUB();
1268 
1269 #ifdef DEBUG_QUEUING_LOCKS
1270     TRACE_LOCK(gtid + 1, "acq retry");
1271 #endif
1272   }
1273   KMP_ASSERT2(0, "should not get here");
1274   return KMP_LOCK_ACQUIRED_FIRST;
1275 }
1276 
1277 int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1278   KMP_DEBUG_ASSERT(gtid >= 0);
1279 
1280   int retval = __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
1281   return retval;
1282 }
1283 
1284 static int __kmp_acquire_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1285                                                   kmp_int32 gtid) {
1286   char const *const func = "omp_set_lock";
1287   if (lck->lk.initialized != lck) {
1288     KMP_FATAL(LockIsUninitialized, func);
1289   }
1290   if (__kmp_is_queuing_lock_nestable(lck)) {
1291     KMP_FATAL(LockNestableUsedAsSimple, func);
1292   }
1293   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1294     KMP_FATAL(LockIsAlreadyOwned, func);
1295   }
1296 
1297   __kmp_acquire_queuing_lock(lck, gtid);
1298 
1299   lck->lk.owner_id = gtid + 1;
1300   return KMP_LOCK_ACQUIRED_FIRST;
1301 }
1302 
1303 int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1304   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1305   kmp_int32 head;
1306 #ifdef KMP_DEBUG
1307   kmp_info_t *this_thr;
1308 #endif
1309 
1310   KA_TRACE(1000, ("__kmp_test_queuing_lock: T#%d entering\n", gtid));
1311   KMP_DEBUG_ASSERT(gtid >= 0);
1312 #ifdef KMP_DEBUG
1313   this_thr = __kmp_thread_from_gtid(gtid);
1314   KMP_DEBUG_ASSERT(this_thr != NULL);
1315   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
1316 #endif
1317 
1318   head = *head_id_p;
1319 
1320   if (head == 0) { /* nobody on queue, nobody holding */
1321     /* try (0,0)->(-1,0) */
1322     if (KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1)) {
1323       KA_TRACE(1000,
1324                ("__kmp_test_queuing_lock: T#%d exiting: holding lock\n", gtid));
1325       KMP_FSYNC_ACQUIRED(lck);
1326       return TRUE;
1327     }
1328   }
1329 
1330   KA_TRACE(1000,
1331            ("__kmp_test_queuing_lock: T#%d exiting: without lock\n", gtid));
1332   return FALSE;
1333 }
1334 
1335 static int __kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1336                                                kmp_int32 gtid) {
1337   char const *const func = "omp_test_lock";
1338   if (lck->lk.initialized != lck) {
1339     KMP_FATAL(LockIsUninitialized, func);
1340   }
1341   if (__kmp_is_queuing_lock_nestable(lck)) {
1342     KMP_FATAL(LockNestableUsedAsSimple, func);
1343   }
1344 
1345   int retval = __kmp_test_queuing_lock(lck, gtid);
1346 
1347   if (retval) {
1348     lck->lk.owner_id = gtid + 1;
1349   }
1350   return retval;
1351 }
1352 
1353 int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1354   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1355   volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
1356 
1357   KA_TRACE(1000,
1358            ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
1359   KMP_DEBUG_ASSERT(gtid >= 0);
1360 #if KMP_DEBUG || DEBUG_QUEUING_LOCKS
1361   kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid);
1362 #endif
1363   KMP_DEBUG_ASSERT(this_thr != NULL);
1364 #ifdef DEBUG_QUEUING_LOCKS
1365   TRACE_LOCK(gtid + 1, "rel ent");
1366 
1367   if (this_thr->th.th_spin_here)
1368     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1369   if (this_thr->th.th_next_waiting != 0)
1370     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1371 #endif
1372   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
1373   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1374 
1375   KMP_FSYNC_RELEASING(lck);
1376 
1377   while (1) {
1378     kmp_int32 dequeued;
1379     kmp_int32 head;
1380     kmp_int32 tail;
1381 
1382     head = *head_id_p;
1383 
1384 #ifdef DEBUG_QUEUING_LOCKS
1385     tail = *tail_id_p;
1386     TRACE_LOCK_HT(gtid + 1, "rel read: ", head, tail);
1387     if (head == 0)
1388       __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1389 #endif
1390     KMP_DEBUG_ASSERT(head !=
1391                      0); /* holding the lock, head must be -1 or queue head */
1392 
1393     if (head == -1) { /* nobody on queue */
1394       /* try (-1,0)->(0,0) */
1395       if (KMP_COMPARE_AND_STORE_REL32(head_id_p, -1, 0)) {
1396         KA_TRACE(
1397             1000,
1398             ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: queue empty\n",
1399              lck, gtid));
1400 #ifdef DEBUG_QUEUING_LOCKS
1401         TRACE_LOCK_HT(gtid + 1, "rel exit: ", 0, 0);
1402 #endif
1403 
1404 #if OMPT_SUPPORT
1405 /* nothing to do - no other thread is trying to shift blame */
1406 #endif
1407         return KMP_LOCK_RELEASED;
1408       }
1409       dequeued = FALSE;
1410     } else {
1411       KMP_MB();
1412       tail = *tail_id_p;
1413       if (head == tail) { /* only one thread on the queue */
1414 #ifdef DEBUG_QUEUING_LOCKS
1415         if (head <= 0)
1416           __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1417 #endif
1418         KMP_DEBUG_ASSERT(head > 0);
1419 
1420         /* try (h,h)->(-1,0) */
1421         dequeued = KMP_COMPARE_AND_STORE_REL64(
1422             RCAST(volatile kmp_int64 *, tail_id_p), KMP_PACK_64(head, head),
1423             KMP_PACK_64(-1, 0));
1424 #ifdef DEBUG_QUEUING_LOCKS
1425         TRACE_LOCK(gtid + 1, "rel deq: (h,h)->(-1,0)");
1426 #endif
1427 
1428       } else {
1429         volatile kmp_int32 *waiting_id_p;
1430         kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
1431         KMP_DEBUG_ASSERT(head_thr != NULL);
1432         waiting_id_p = &head_thr->th.th_next_waiting;
1433 
1434 /* Does this require synchronous reads? */
1435 #ifdef DEBUG_QUEUING_LOCKS
1436         if (head <= 0 || tail <= 0)
1437           __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1438 #endif
1439         KMP_DEBUG_ASSERT(head > 0 && tail > 0);
1440 
1441         /* try (h,t)->(h',t) or (t,t) */
1442         KMP_MB();
1443         /* make sure enqueuing thread has time to update next waiting thread
1444          * field */
1445         *head_id_p =
1446             KMP_WAIT((volatile kmp_uint32 *)waiting_id_p, 0, KMP_NEQ, NULL);
1447 #ifdef DEBUG_QUEUING_LOCKS
1448         TRACE_LOCK(gtid + 1, "rel deq: (h,t)->(h',t)");
1449 #endif
1450         dequeued = TRUE;
1451       }
1452     }
1453 
1454     if (dequeued) {
1455       kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
1456       KMP_DEBUG_ASSERT(head_thr != NULL);
1457 
1458 /* Does this require synchronous reads? */
1459 #ifdef DEBUG_QUEUING_LOCKS
1460       if (head <= 0 || tail <= 0)
1461         __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1462 #endif
1463       KMP_DEBUG_ASSERT(head > 0 && tail > 0);
1464 
1465       /* For clean code only. Thread not released until next statement prevents
1466          race with acquire code. */
1467       head_thr->th.th_next_waiting = 0;
1468 #ifdef DEBUG_QUEUING_LOCKS
1469       TRACE_LOCK_T(gtid + 1, "rel nw=0 for t=", head);
1470 #endif
1471 
1472       KMP_MB();
1473       /* reset spin value */
1474       head_thr->th.th_spin_here = FALSE;
1475 
1476       KA_TRACE(1000, ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: after "
1477                       "dequeuing\n",
1478                       lck, gtid));
1479 #ifdef DEBUG_QUEUING_LOCKS
1480       TRACE_LOCK(gtid + 1, "rel exit 2");
1481 #endif
1482       return KMP_LOCK_RELEASED;
1483     }
1484     /* KMP_CPU_PAUSE(); don't want to make releasing thread hold up acquiring
1485        threads */
1486 
1487 #ifdef DEBUG_QUEUING_LOCKS
1488     TRACE_LOCK(gtid + 1, "rel retry");
1489 #endif
1490 
1491   } /* while */
1492   KMP_ASSERT2(0, "should not get here");
1493   return KMP_LOCK_RELEASED;
1494 }
1495 
1496 static int __kmp_release_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1497                                                   kmp_int32 gtid) {
1498   char const *const func = "omp_unset_lock";
1499   KMP_MB(); /* in case another processor initialized lock */
1500   if (lck->lk.initialized != lck) {
1501     KMP_FATAL(LockIsUninitialized, func);
1502   }
1503   if (__kmp_is_queuing_lock_nestable(lck)) {
1504     KMP_FATAL(LockNestableUsedAsSimple, func);
1505   }
1506   if (__kmp_get_queuing_lock_owner(lck) == -1) {
1507     KMP_FATAL(LockUnsettingFree, func);
1508   }
1509   if (__kmp_get_queuing_lock_owner(lck) != gtid) {
1510     KMP_FATAL(LockUnsettingSetByAnother, func);
1511   }
1512   lck->lk.owner_id = 0;
1513   return __kmp_release_queuing_lock(lck, gtid);
1514 }
1515 
1516 void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck) {
1517   lck->lk.location = NULL;
1518   lck->lk.head_id = 0;
1519   lck->lk.tail_id = 0;
1520   lck->lk.next_ticket = 0;
1521   lck->lk.now_serving = 0;
1522   lck->lk.owner_id = 0; // no thread owns the lock.
1523   lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
1524   lck->lk.initialized = lck;
1525 
1526   KA_TRACE(1000, ("__kmp_init_queuing_lock: lock %p initialized\n", lck));
1527 }
1528 
1529 void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck) {
1530   lck->lk.initialized = NULL;
1531   lck->lk.location = NULL;
1532   lck->lk.head_id = 0;
1533   lck->lk.tail_id = 0;
1534   lck->lk.next_ticket = 0;
1535   lck->lk.now_serving = 0;
1536   lck->lk.owner_id = 0;
1537   lck->lk.depth_locked = -1;
1538 }
1539 
1540 static void __kmp_destroy_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1541   char const *const func = "omp_destroy_lock";
1542   if (lck->lk.initialized != lck) {
1543     KMP_FATAL(LockIsUninitialized, func);
1544   }
1545   if (__kmp_is_queuing_lock_nestable(lck)) {
1546     KMP_FATAL(LockNestableUsedAsSimple, func);
1547   }
1548   if (__kmp_get_queuing_lock_owner(lck) != -1) {
1549     KMP_FATAL(LockStillOwned, func);
1550   }
1551   __kmp_destroy_queuing_lock(lck);
1552 }
1553 
1554 // nested queuing locks
1555 
1556 int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1557   KMP_DEBUG_ASSERT(gtid >= 0);
1558 
1559   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1560     lck->lk.depth_locked += 1;
1561     return KMP_LOCK_ACQUIRED_NEXT;
1562   } else {
1563     __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
1564     KMP_MB();
1565     lck->lk.depth_locked = 1;
1566     KMP_MB();
1567     lck->lk.owner_id = gtid + 1;
1568     return KMP_LOCK_ACQUIRED_FIRST;
1569   }
1570 }
1571 
1572 static int
1573 __kmp_acquire_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1574                                               kmp_int32 gtid) {
1575   char const *const func = "omp_set_nest_lock";
1576   if (lck->lk.initialized != lck) {
1577     KMP_FATAL(LockIsUninitialized, func);
1578   }
1579   if (!__kmp_is_queuing_lock_nestable(lck)) {
1580     KMP_FATAL(LockSimpleUsedAsNestable, func);
1581   }
1582   return __kmp_acquire_nested_queuing_lock(lck, gtid);
1583 }
1584 
1585 int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1586   int retval;
1587 
1588   KMP_DEBUG_ASSERT(gtid >= 0);
1589 
1590   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1591     retval = ++lck->lk.depth_locked;
1592   } else if (!__kmp_test_queuing_lock(lck, gtid)) {
1593     retval = 0;
1594   } else {
1595     KMP_MB();
1596     retval = lck->lk.depth_locked = 1;
1597     KMP_MB();
1598     lck->lk.owner_id = gtid + 1;
1599   }
1600   return retval;
1601 }
1602 
1603 static int __kmp_test_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1604                                                       kmp_int32 gtid) {
1605   char const *const func = "omp_test_nest_lock";
1606   if (lck->lk.initialized != lck) {
1607     KMP_FATAL(LockIsUninitialized, func);
1608   }
1609   if (!__kmp_is_queuing_lock_nestable(lck)) {
1610     KMP_FATAL(LockSimpleUsedAsNestable, func);
1611   }
1612   return __kmp_test_nested_queuing_lock(lck, gtid);
1613 }
1614 
1615 int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1616   KMP_DEBUG_ASSERT(gtid >= 0);
1617 
1618   KMP_MB();
1619   if (--(lck->lk.depth_locked) == 0) {
1620     KMP_MB();
1621     lck->lk.owner_id = 0;
1622     __kmp_release_queuing_lock(lck, gtid);
1623     return KMP_LOCK_RELEASED;
1624   }
1625   return KMP_LOCK_STILL_HELD;
1626 }
1627 
1628 static int
1629 __kmp_release_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1630                                               kmp_int32 gtid) {
1631   char const *const func = "omp_unset_nest_lock";
1632   KMP_MB(); /* in case another processor initialized lock */
1633   if (lck->lk.initialized != lck) {
1634     KMP_FATAL(LockIsUninitialized, func);
1635   }
1636   if (!__kmp_is_queuing_lock_nestable(lck)) {
1637     KMP_FATAL(LockSimpleUsedAsNestable, func);
1638   }
1639   if (__kmp_get_queuing_lock_owner(lck) == -1) {
1640     KMP_FATAL(LockUnsettingFree, func);
1641   }
1642   if (__kmp_get_queuing_lock_owner(lck) != gtid) {
1643     KMP_FATAL(LockUnsettingSetByAnother, func);
1644   }
1645   return __kmp_release_nested_queuing_lock(lck, gtid);
1646 }
1647 
1648 void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck) {
1649   __kmp_init_queuing_lock(lck);
1650   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
1651 }
1652 
1653 void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck) {
1654   __kmp_destroy_queuing_lock(lck);
1655   lck->lk.depth_locked = 0;
1656 }
1657 
1658 static void
1659 __kmp_destroy_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1660   char const *const func = "omp_destroy_nest_lock";
1661   if (lck->lk.initialized != lck) {
1662     KMP_FATAL(LockIsUninitialized, func);
1663   }
1664   if (!__kmp_is_queuing_lock_nestable(lck)) {
1665     KMP_FATAL(LockSimpleUsedAsNestable, func);
1666   }
1667   if (__kmp_get_queuing_lock_owner(lck) != -1) {
1668     KMP_FATAL(LockStillOwned, func);
1669   }
1670   __kmp_destroy_nested_queuing_lock(lck);
1671 }
1672 
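// Editor's note: a minimal usage sketch (illustrative only, not part of the
// runtime) of how the nested queuing lock above behaves when it is the lock
// kind the runtime has selected for omp_nest_lock_t:
//
//   omp_nest_lock_t l;
//   omp_init_nest_lock(&l);    // depth_locked == 0
//   omp_set_nest_lock(&l);     // first acquire by this thread: depth_locked == 1
//   omp_set_nest_lock(&l);     // re-acquire by the owner:      depth_locked == 2
//   omp_unset_nest_lock(&l);   // KMP_LOCK_STILL_HELD internally, depth_locked == 1
//   omp_unset_nest_lock(&l);   // KMP_LOCK_RELEASED; the lock is free again
//   omp_destroy_nest_lock(&l);
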
1673 // access functions to fields which don't exist for all lock kinds.
1674 
1675 static const ident_t *__kmp_get_queuing_lock_location(kmp_queuing_lock_t *lck) {
1676   return lck->lk.location;
1677 }
1678 
1679 static void __kmp_set_queuing_lock_location(kmp_queuing_lock_t *lck,
1680                                             const ident_t *loc) {
1681   lck->lk.location = loc;
1682 }
1683 
1684 static kmp_lock_flags_t __kmp_get_queuing_lock_flags(kmp_queuing_lock_t *lck) {
1685   return lck->lk.flags;
1686 }
1687 
1688 static void __kmp_set_queuing_lock_flags(kmp_queuing_lock_t *lck,
1689                                          kmp_lock_flags_t flags) {
1690   lck->lk.flags = flags;
1691 }
1692 
1693 #if KMP_USE_ADAPTIVE_LOCKS
1694 
1695 /* RTM Adaptive locks */
1696 
1697 #if KMP_HAVE_RTM_INTRINSICS
1698 #include <immintrin.h>
1699 #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
1700 
1701 #else
1702 
1703 // Values from the status register after failed speculation.
1704 #define _XBEGIN_STARTED (~0u)
1705 #define _XABORT_EXPLICIT (1 << 0)
1706 #define _XABORT_RETRY (1 << 1)
1707 #define _XABORT_CONFLICT (1 << 2)
1708 #define _XABORT_CAPACITY (1 << 3)
1709 #define _XABORT_DEBUG (1 << 4)
1710 #define _XABORT_NESTED (1 << 5)
1711 #define _XABORT_CODE(x) ((unsigned char)(((x) >> 24) & 0xFF))
1712 
1713 // Aborts for which it's worth trying again immediately
1714 #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
1715 
1716 #define STRINGIZE_INTERNAL(arg) #arg
1717 #define STRINGIZE(arg) STRINGIZE_INTERNAL(arg)
1718 
1719 // Access to RTM instructions
1720 /*A version of XBegin which returns -1 on speculation, and the value of EAX on
1721   an abort. This is the same definition as the compiler intrinsic that will be
1722   supported at some point. */
1723 static __inline int _xbegin() {
1724   int res = -1;
1725 
1726 #if KMP_OS_WINDOWS
1727 #if KMP_ARCH_X86_64
1728   _asm {
1729         _emit 0xC7
1730         _emit 0xF8
1731         _emit 2
1732         _emit 0
1733         _emit 0
1734         _emit 0
1735         jmp   L2
1736         mov   res, eax
1737     L2:
1738   }
1739 #else /* IA32 */
1740   _asm {
1741         _emit 0xC7
1742         _emit 0xF8
1743         _emit 2
1744         _emit 0
1745         _emit 0
1746         _emit 0
1747         jmp   L2
1748         mov   res, eax
1749     L2:
1750   }
1751 #endif // KMP_ARCH_X86_64
1752 #else
1753   /* Note that %eax must be noted as killed (clobbered), because the XSR is
1754      returned in %eax(%rax) on abort.  Other register values are restored, so
1755      don't need to be killed.
1756 
1757      We must also mark 'res' as an input and an output, since otherwise
1758      'res=-1' may be dropped as being dead, whereas we do need the assignment on
1759      the successful (i.e., non-abort) path. */
1760   __asm__ volatile("1: .byte  0xC7; .byte 0xF8;\n"
1761                    "   .long  1f-1b-6\n"
1762                    "    jmp   2f\n"
1763                    "1:  movl  %%eax,%0\n"
1764                    "2:"
1765                    : "+r"(res)::"memory", "%eax");
1766 #endif // KMP_OS_WINDOWS
1767   return res;
1768 }
1769 
1770 /* Transaction end */
1771 static __inline void _xend() {
1772 #if KMP_OS_WINDOWS
1773   __asm {
1774         _emit 0x0f
1775         _emit 0x01
1776         _emit 0xd5
1777   }
1778 #else
1779   __asm__ volatile(".byte 0x0f; .byte 0x01; .byte 0xd5" ::: "memory");
1780 #endif
1781 }
1782 
1783 /* This is a macro; the argument must be a single-byte constant that can be
1784    evaluated by the inline assembler, since it is emitted as a byte into the
1785    assembly code. */
1786 // clang-format off
1787 #if KMP_OS_WINDOWS
1788 #define _xabort(ARG) _asm _emit 0xc6 _asm _emit 0xf8 _asm _emit ARG
1789 #else
1790 #define _xabort(ARG)                                                           \
1791   __asm__ volatile(".byte 0xC6; .byte 0xF8; .byte " STRINGIZE(ARG):::"memory");
1792 #endif
1793 // clang-format on
1794 #endif // KMP_HAVE_RTM_INTRINSICS
1795 
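// Editor's note: an illustrative sketch (not part of the runtime) of the usage
// pattern that the _xbegin/_xabort/_xend definitions above support; the
// adaptive and RTM lock routines below all follow this shape. On an abort,
// execution resumes at the _xbegin call site with a status describing the
// abort cause:
//
//   kmp_uint32 status = _xbegin();
//   if (status == _XBEGIN_STARTED) {
//     // Transactional region: memory accesses are buffered until commit.
//     if (must_give_up)        // 'must_give_up' is a placeholder condition
//       _xabort(0x01);         // explicit abort; 0x01 is seen via _XABORT_CODE
//     _xend();                 // commit the transaction
//   } else if (status & SOFT_ABORT_MASK) {
//     // Retry-worthy abort (retry hint, conflict, or explicit abort).
//   } else {
//     // Hard failure (e.g. capacity); fall back to a real lock.
//   }
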
1796 // Statistics are collected for testing purposes
1797 #if KMP_DEBUG_ADAPTIVE_LOCKS
1798 
1799 // We accumulate speculative lock statistics when the lock is destroyed. We
1800 // keep locks that haven't been destroyed in the liveLocks list so that we can
1801 // grab their statistics too.
1802 static kmp_adaptive_lock_statistics_t destroyedStats;
1803 
1804 // To hold the list of live locks.
1805 static kmp_adaptive_lock_info_t liveLocks;
1806 
1807 // A lock so we can safely update the list of locks.
1808 static kmp_bootstrap_lock_t chain_lock =
1809     KMP_BOOTSTRAP_LOCK_INITIALIZER(chain_lock);
1810 
1811 // Initialize the list of stats.
1812 void __kmp_init_speculative_stats() {
1813   kmp_adaptive_lock_info_t *lck = &liveLocks;
1814 
1815   memset(CCAST(kmp_adaptive_lock_statistics_t *, &(lck->stats)), 0,
1816          sizeof(lck->stats));
1817   lck->stats.next = lck;
1818   lck->stats.prev = lck;
1819 
1820   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1821   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1822 
1823   __kmp_init_bootstrap_lock(&chain_lock);
1824 }
1825 
1826 // Insert the lock into the circular list
1827 static void __kmp_remember_lock(kmp_adaptive_lock_info_t *lck) {
1828   __kmp_acquire_bootstrap_lock(&chain_lock);
1829 
1830   lck->stats.next = liveLocks.stats.next;
1831   lck->stats.prev = &liveLocks;
1832 
1833   liveLocks.stats.next = lck;
1834   lck->stats.next->stats.prev = lck;
1835 
1836   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1837   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1838 
1839   __kmp_release_bootstrap_lock(&chain_lock);
1840 }
1841 
1842 static void __kmp_forget_lock(kmp_adaptive_lock_info_t *lck) {
1843   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1844   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1845 
1846   kmp_adaptive_lock_info_t *n = lck->stats.next;
1847   kmp_adaptive_lock_info_t *p = lck->stats.prev;
1848 
1849   n->stats.prev = p;
1850   p->stats.next = n;
1851 }
1852 
1853 static void __kmp_zero_speculative_stats(kmp_adaptive_lock_info_t *lck) {
1854   memset(CCAST(kmp_adaptive_lock_statistics_t *, &lck->stats), 0,
1855          sizeof(lck->stats));
1856   __kmp_remember_lock(lck);
1857 }
1858 
1859 static void __kmp_add_stats(kmp_adaptive_lock_statistics_t *t,
1860                             kmp_adaptive_lock_info_t *lck) {
1861   kmp_adaptive_lock_statistics_t volatile *s = &lck->stats;
1862 
1863   t->nonSpeculativeAcquireAttempts += lck->acquire_attempts;
1864   t->successfulSpeculations += s->successfulSpeculations;
1865   t->hardFailedSpeculations += s->hardFailedSpeculations;
1866   t->softFailedSpeculations += s->softFailedSpeculations;
1867   t->nonSpeculativeAcquires += s->nonSpeculativeAcquires;
1868   t->lemmingYields += s->lemmingYields;
1869 }
1870 
1871 static void __kmp_accumulate_speculative_stats(kmp_adaptive_lock_info_t *lck) {
1872   __kmp_acquire_bootstrap_lock(&chain_lock);
1873 
1874   __kmp_add_stats(&destroyedStats, lck);
1875   __kmp_forget_lock(lck);
1876 
1877   __kmp_release_bootstrap_lock(&chain_lock);
1878 }
1879 
1880 static float percent(kmp_uint32 count, kmp_uint32 total) {
1881   return (total == 0) ? 0.0 : (100.0 * count) / total;
1882 }
1883 
1884 void __kmp_print_speculative_stats() {
1885   kmp_adaptive_lock_statistics_t total = destroyedStats;
1886   kmp_adaptive_lock_info_t *lck;
1887 
1888   for (lck = liveLocks.stats.next; lck != &liveLocks; lck = lck->stats.next) {
1889     __kmp_add_stats(&total, lck);
1890   }
1891   kmp_adaptive_lock_statistics_t *t = &total;
1892   kmp_uint32 totalSections =
1893       t->nonSpeculativeAcquires + t->successfulSpeculations;
1894   kmp_uint32 totalSpeculations = t->successfulSpeculations +
1895                                  t->hardFailedSpeculations +
1896                                  t->softFailedSpeculations;
1897   if (totalSections <= 0)
1898     return;
1899 
1900   kmp_safe_raii_file_t statsFile;
1901   if (strcmp(__kmp_speculative_statsfile, "-") == 0) {
1902     statsFile.set_stdout();
1903   } else {
1904     size_t buffLen = KMP_STRLEN(__kmp_speculative_statsfile) + 20;
1905     char buffer[buffLen];
1906     KMP_SNPRINTF(&buffer[0], buffLen, __kmp_speculative_statsfile,
1907                  (kmp_int32)getpid());
1908     statsFile.open(buffer, "w");
1909   }
1910 
1911   fprintf(statsFile, "Speculative lock statistics (all approximate!)\n");
1912   fprintf(statsFile,
1913           " Lock parameters: \n"
1914           "   max_soft_retries               : %10d\n"
1915           "   max_badness                    : %10d\n",
1916           __kmp_adaptive_backoff_params.max_soft_retries,
1917           __kmp_adaptive_backoff_params.max_badness);
1918   fprintf(statsFile, " Non-speculative acquire attempts : %10d\n",
1919           t->nonSpeculativeAcquireAttempts);
1920   fprintf(statsFile, " Total critical sections          : %10d\n",
1921           totalSections);
1922   fprintf(statsFile, " Successful speculations          : %10d (%5.1f%%)\n",
1923           t->successfulSpeculations,
1924           percent(t->successfulSpeculations, totalSections));
1925   fprintf(statsFile, " Non-speculative acquires         : %10d (%5.1f%%)\n",
1926           t->nonSpeculativeAcquires,
1927           percent(t->nonSpeculativeAcquires, totalSections));
1928   fprintf(statsFile, " Lemming yields                   : %10d\n\n",
1929           t->lemmingYields);
1930 
1931   fprintf(statsFile, " Speculative acquire attempts     : %10d\n",
1932           totalSpeculations);
1933   fprintf(statsFile, " Successes                        : %10d (%5.1f%%)\n",
1934           t->successfulSpeculations,
1935           percent(t->successfulSpeculations, totalSpeculations));
1936   fprintf(statsFile, " Soft failures                    : %10d (%5.1f%%)\n",
1937           t->softFailedSpeculations,
1938           percent(t->softFailedSpeculations, totalSpeculations));
1939   fprintf(statsFile, " Hard failures                    : %10d (%5.1f%%)\n",
1940           t->hardFailedSpeculations,
1941           percent(t->hardFailedSpeculations, totalSpeculations));
1942 }
1943 
1944 #define KMP_INC_STAT(lck, stat) (lck->lk.adaptive.stats.stat++)
1945 #else
1946 #define KMP_INC_STAT(lck, stat)
1947 
1948 #endif // KMP_DEBUG_ADAPTIVE_LOCKS
1949 
1950 static inline bool __kmp_is_unlocked_queuing_lock(kmp_queuing_lock_t *lck) {
1951   // It is enough to check that the head_id is zero.
1952   // We don't need to check the tail as well.
1953   bool res = lck->lk.head_id == 0;
1954 
1955 // We need a fence here, since we must ensure that no memory operations
1956 // from later in this thread float above that read.
1957 #if KMP_COMPILER_ICC || KMP_COMPILER_ICX
1958   _mm_mfence();
1959 #else
1960   __sync_synchronize();
1961 #endif
1962 
1963   return res;
1964 }
1965 
1966 // Functions for manipulating the badness
1967 static __inline void
1968 __kmp_update_badness_after_success(kmp_adaptive_lock_t *lck) {
1969   // Reset the badness to zero so we eagerly try to speculate again
1970   lck->lk.adaptive.badness = 0;
1971   KMP_INC_STAT(lck, successfulSpeculations);
1972 }
1973 
1974 // Create a bit mask with one more set bit.
1975 static __inline void __kmp_step_badness(kmp_adaptive_lock_t *lck) {
1976   kmp_uint32 newBadness = (lck->lk.adaptive.badness << 1) | 1;
1977   if (newBadness > lck->lk.adaptive.max_badness) {
1978     return;
1979   } else {
1980     lck->lk.adaptive.badness = newBadness;
1981   }
1982 }
1983 
1984 // Check whether speculation should be attempted.
1985 KMP_ATTRIBUTE_TARGET_RTM
1986 static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck,
1987                                            kmp_int32 gtid) {
1988   kmp_uint32 badness = lck->lk.adaptive.badness;
1989   kmp_uint32 attempts = lck->lk.adaptive.acquire_attempts;
1990   int res = (attempts & badness) == 0;
1991   return res;
1992 }
1993 
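// Editor's note: a small worked example of the gate above (illustrative only).
// badness is always a mask of the form 2^k - 1 (see __kmp_step_badness), so
// (attempts & badness) == 0 admits speculation only when the low k bits of the
// acquire-attempt counter are zero, i.e. roughly once every 2^k acquisitions:
//
//   badness == 0b000 -> speculate on every attempt
//   badness == 0b011 -> speculate on every 4th attempt
//   badness == 0b111 -> speculate on every 8th attempt
//
// A successful speculation resets badness to 0; each failed round widens the
// mask by one bit, up to max_badness.
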
1994 // Attempt to acquire only the speculative lock.
1995 // Does not back off to the non-speculative lock.
1996 KMP_ATTRIBUTE_TARGET_RTM
1997 static int __kmp_test_adaptive_lock_only(kmp_adaptive_lock_t *lck,
1998                                          kmp_int32 gtid) {
1999   int retries = lck->lk.adaptive.max_soft_retries;
2000 
2001   // We don't explicitly count the start of speculation, rather we record the
2002   // results (success, hard fail, soft fail). The sum of all of those is the
2003   // total number of times we started speculation since all speculations must
2004   // end one of those ways.
2005   do {
2006     kmp_uint32 status = _xbegin();
2007     // Switch this in to disable actual speculation but exercise at least some
2008     // of the rest of the code. Useful for debugging...
2009     // kmp_uint32 status = _XABORT_NESTED;
2010 
2011     if (status == _XBEGIN_STARTED) {
2012       /* We have successfully started speculation. Check that no-one acquired
2013          the lock for real between when we last looked and now. This also gets
2014          the lock cache line into our read-set, which we need so that we'll
2015          abort if anyone later claims it for real. */
2016       if (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2017         // Lock is now visibly acquired, so someone beat us to it. Abort the
2018         // transaction so we'll restart from _xbegin with the failure status.
2019         _xabort(0x01);
2020         KMP_ASSERT2(0, "should not get here");
2021       }
2022       return 1; // Lock has been acquired (speculatively)
2023     } else {
2024       // We have aborted, update the statistics
2025       if (status & SOFT_ABORT_MASK) {
2026         KMP_INC_STAT(lck, softFailedSpeculations);
2027         // and loop round to retry.
2028       } else {
2029         KMP_INC_STAT(lck, hardFailedSpeculations);
2030         // Give up if we had a hard failure.
2031         break;
2032       }
2033     }
2034   } while (retries--); // Loop while we have retries, and didn't fail hard.
2035 
2036   // Either we had a hard failure or we didn't succeed softly after
2037   // the full set of attempts, so back off the badness.
2038   __kmp_step_badness(lck);
2039   return 0;
2040 }
2041 
2042 // Attempt to acquire the speculative lock, or back off to the non-speculative
2043 // one if the speculative lock cannot be acquired.
2044 // We can succeed speculatively, non-speculatively, or fail.
2045 static int __kmp_test_adaptive_lock(kmp_adaptive_lock_t *lck, kmp_int32 gtid) {
2046   // First try to acquire the lock speculatively
2047   if (__kmp_should_speculate(lck, gtid) &&
2048       __kmp_test_adaptive_lock_only(lck, gtid))
2049     return 1;
2050 
2051   // Speculative acquisition failed, so try to acquire it non-speculatively.
2052   // Count the non-speculative acquire attempt
2053   lck->lk.adaptive.acquire_attempts++;
2054 
2055   // Use base, non-speculative lock.
2056   if (__kmp_test_queuing_lock(GET_QLK_PTR(lck), gtid)) {
2057     KMP_INC_STAT(lck, nonSpeculativeAcquires);
2058     return 1; // Lock is acquired (non-speculatively)
2059   } else {
2060     return 0; // Failed to acquire the lock, it's already visibly locked.
2061     return 0; // Failed to acquire the lock; it's already visibly locked.
2062 }
2063 
2064 static int __kmp_test_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2065                                                 kmp_int32 gtid) {
2066   char const *const func = "omp_test_lock";
2067   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2068     KMP_FATAL(LockIsUninitialized, func);
2069   }
2070 
2071   int retval = __kmp_test_adaptive_lock(lck, gtid);
2072 
2073   if (retval) {
2074     lck->lk.qlk.owner_id = gtid + 1;
2075   }
2076   return retval;
2077 }
2078 
2079 // Block until we can acquire a speculative, adaptive lock. We check whether we
2080 // should be trying to speculate. If we should be, we check the real lock to see
2081 // if it is free, and, if not, pause without attempting to acquire it until it
2082 // is. Then we try the speculative acquire. This means that although we suffer
2083 // from lemmings a little (because we can't acquire the lock speculatively
2084 // until the queue of waiting threads has cleared), we don't get into a state
2085 // where we can never acquire the lock speculatively (because we force the queue
2086 // to clear by preventing new arrivals from entering the queue). This does mean
2087 // that when we're trying to break lemmings, the lock is no longer fair. However,
2088 // OpenMP makes no guarantee that its locks are fair, so this isn't a real
2089 // problem.
2090 static void __kmp_acquire_adaptive_lock(kmp_adaptive_lock_t *lck,
2091                                         kmp_int32 gtid) {
2092   if (__kmp_should_speculate(lck, gtid)) {
2093     if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2094       if (__kmp_test_adaptive_lock_only(lck, gtid))
2095         return;
2096       // We tried speculation and failed, so give up.
2097     } else {
2098       // We can't try speculation until the lock is free, so we pause here
2099       // (without suspending on the queuing lock) to allow it to drain, then
2100       // try again. All other threads will also see the same result from
2101       // __kmp_should_speculate, so they will be doing the same if they try to
2102       // claim the lock from now on.
2103       while (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2104         KMP_INC_STAT(lck, lemmingYields);
2105         KMP_YIELD(TRUE);
2106       }
2107 
2108       if (__kmp_test_adaptive_lock_only(lck, gtid))
2109         return;
2110     }
2111   }
2112 
2113   // Speculative acquisition failed, so acquire it non-speculatively.
2114   // Count the non-speculative acquire attempt
2115   lck->lk.adaptive.acquire_attempts++;
2116 
2117   __kmp_acquire_queuing_lock_timed_template<FALSE>(GET_QLK_PTR(lck), gtid);
2118   // We have acquired the base lock, so count that.
2119   KMP_INC_STAT(lck, nonSpeculativeAcquires);
2120 }
2121 
2122 static void __kmp_acquire_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2123                                                     kmp_int32 gtid) {
2124   char const *const func = "omp_set_lock";
2125   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2126     KMP_FATAL(LockIsUninitialized, func);
2127   }
2128   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == gtid) {
2129     KMP_FATAL(LockIsAlreadyOwned, func);
2130   }
2131 
2132   __kmp_acquire_adaptive_lock(lck, gtid);
2133 
2134   lck->lk.qlk.owner_id = gtid + 1;
2135 }
2136 
2137 KMP_ATTRIBUTE_TARGET_RTM
2138 static int __kmp_release_adaptive_lock(kmp_adaptive_lock_t *lck,
2139                                        kmp_int32 gtid) {
2140   if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(
2141           lck))) { // If the lock doesn't look claimed we must be speculating.
2142     // (Or the user's code is buggy and they're releasing without locking;
2143     // if we had XTEST we'd be able to check that case...)
2144     _xend(); // Exit speculation
2145     __kmp_update_badness_after_success(lck);
2146   } else { // Since the lock *is* visibly locked we're not speculating,
2147     // so should use the underlying lock's release scheme.
2148     __kmp_release_queuing_lock(GET_QLK_PTR(lck), gtid);
2149   }
2150   return KMP_LOCK_RELEASED;
2151 }
2152 
2153 static int __kmp_release_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2154                                                    kmp_int32 gtid) {
2155   char const *const func = "omp_unset_lock";
2156   KMP_MB(); /* in case another processor initialized lock */
2157   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2158     KMP_FATAL(LockIsUninitialized, func);
2159   }
2160   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == -1) {
2161     KMP_FATAL(LockUnsettingFree, func);
2162   }
2163   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != gtid) {
2164     KMP_FATAL(LockUnsettingSetByAnother, func);
2165   }
2166   lck->lk.qlk.owner_id = 0;
2167   __kmp_release_adaptive_lock(lck, gtid);
2168   return KMP_LOCK_RELEASED;
2169 }
2170 
2171 static void __kmp_init_adaptive_lock(kmp_adaptive_lock_t *lck) {
2172   __kmp_init_queuing_lock(GET_QLK_PTR(lck));
2173   lck->lk.adaptive.badness = 0;
2174   lck->lk.adaptive.acquire_attempts = 0; // nonSpeculativeAcquireAttempts = 0;
2175   lck->lk.adaptive.max_soft_retries =
2176       __kmp_adaptive_backoff_params.max_soft_retries;
2177   lck->lk.adaptive.max_badness = __kmp_adaptive_backoff_params.max_badness;
2178 #if KMP_DEBUG_ADAPTIVE_LOCKS
2179   __kmp_zero_speculative_stats(&lck->lk.adaptive);
2180 #endif
2181   KA_TRACE(1000, ("__kmp_init_adaptive_lock: lock %p initialized\n", lck));
2182 }
2183 
2184 static void __kmp_destroy_adaptive_lock(kmp_adaptive_lock_t *lck) {
2185 #if KMP_DEBUG_ADAPTIVE_LOCKS
2186   __kmp_accumulate_speculative_stats(&lck->lk.adaptive);
2187 #endif
2188   __kmp_destroy_queuing_lock(GET_QLK_PTR(lck));
2189   // Nothing needed for the speculative part.
2190 }
2191 
2192 static void __kmp_destroy_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
2193   char const *const func = "omp_destroy_lock";
2194   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2195     KMP_FATAL(LockIsUninitialized, func);
2196   }
2197   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != -1) {
2198     KMP_FATAL(LockStillOwned, func);
2199   }
2200   __kmp_destroy_adaptive_lock(lck);
2201 }
2202 
2203 #endif // KMP_USE_ADAPTIVE_LOCKS
2204 
2205 /* ------------------------------------------------------------------------ */
2206 /* DRDPA ticket locks                                                */
2207 /* "DRDPA" means Dynamically Reconfigurable Distributed Polling Area */
2208 
2209 static kmp_int32 __kmp_get_drdpa_lock_owner(kmp_drdpa_lock_t *lck) {
2210   return lck->lk.owner_id - 1;
2211 }
2212 
2213 static inline bool __kmp_is_drdpa_lock_nestable(kmp_drdpa_lock_t *lck) {
2214   return lck->lk.depth_locked != -1;
2215 }
2216 
2217 __forceinline static int
2218 __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2219   kmp_uint64 ticket = KMP_ATOMIC_INC(&lck->lk.next_ticket);
2220   kmp_uint64 mask = lck->lk.mask; // atomic load
2221   std::atomic<kmp_uint64> *polls = lck->lk.polls;
2222 
2223 #ifdef USE_LOCK_PROFILE
2224   if (polls[ticket & mask] != ticket)
2225     __kmp_printf("LOCK CONTENTION: %p\n", lck);
2226 /* else __kmp_printf( "." );*/
2227 #endif /* USE_LOCK_PROFILE */
2228 
2229   // Now spin-wait, but reload the polls pointer and mask, in case the
2230   // polling area has been reconfigured.  Unless it is reconfigured, the
2231   // reloads stay in L1 cache and are cheap.
2232   //
2233   // Keep this code in sync with KMP_WAIT, in kmp_dispatch.cpp !!!
2234   // The current implementation of KMP_WAIT doesn't allow for mask
2235   // and poll to be re-read every spin iteration.
2236   kmp_uint32 spins;
2237   kmp_uint64 time;
2238   KMP_FSYNC_PREPARE(lck);
2239   KMP_INIT_YIELD(spins);
2240   KMP_INIT_BACKOFF(time);
2241   while (polls[ticket & mask] < ticket) { // atomic load
2242     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
2243     // Re-read the mask and the poll pointer from the lock structure.
2244     //
2245     // Make certain that "mask" is read before "polls" !!!
2246     //
2247     // If another thread reconfigures the polling area and updates these
2248     // values, and we get the new value of mask but the old polls pointer, we
2249     // could access memory beyond the end of the old polling area.
2250     mask = lck->lk.mask; // atomic load
2251     polls = lck->lk.polls; // atomic load
2252   }
2253 
2254   // Critical section starts here
2255   KMP_FSYNC_ACQUIRED(lck);
2256   KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld acquired lock %p\n",
2257                   ticket, lck));
2258   lck->lk.now_serving = ticket; // non-volatile store
2259 
2260   // Deallocate a garbage polling area if we know that we are the last
2261   // thread that could possibly access it.
2262   //
2263   // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup
2264   // ticket.
2265   if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) {
2266     __kmp_free(lck->lk.old_polls);
2267     lck->lk.old_polls = NULL;
2268     lck->lk.cleanup_ticket = 0;
2269   }
2270 
2271   // Check to see if we should reconfigure the polling area.
2272   // If there is still a garbage polling area to be deallocated from a
2273   // previous reconfiguration, let a later thread reconfigure it.
2274   if (lck->lk.old_polls == NULL) {
2275     bool reconfigure = false;
2276     std::atomic<kmp_uint64> *old_polls = polls;
2277     kmp_uint32 num_polls = TCR_4(lck->lk.num_polls);
2278 
2279     if (TCR_4(__kmp_nth) >
2280         (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
2281       // We are in oversubscription mode.  Contract the polling area
2282       // down to a single location, if that hasn't been done already.
2283       if (num_polls > 1) {
2284         reconfigure = true;
2285         num_polls = TCR_4(lck->lk.num_polls);
2286         mask = 0;
2287         num_polls = 1;
2288         polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
2289                                                           sizeof(*polls));
2290         polls[0] = ticket;
2291       }
2292     } else {
2293       // We are in under/fully subscribed mode.  Check the number of
2294       // threads waiting on the lock.  The size of the polling area
2295       // should be at least the number of threads waiting.
2296       kmp_uint64 num_waiting = TCR_8(lck->lk.next_ticket) - ticket - 1;
2297       if (num_waiting > num_polls) {
2298         kmp_uint32 old_num_polls = num_polls;
2299         reconfigure = true;
2300         do {
2301           mask = (mask << 1) | 1;
2302           num_polls *= 2;
2303         } while (num_polls <= num_waiting);
2304 
2305         // Allocate the new polling area, and copy the relevant portion
2306         // of the old polling area to the new area.  __kmp_allocate()
2307         // zeroes the memory it allocates, and most of the old area is
2308         // just zero padding, so we only copy the release counters.
2309         polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
2310                                                           sizeof(*polls));
2311         kmp_uint32 i;
2312         for (i = 0; i < old_num_polls; i++) {
2313           polls[i].store(old_polls[i]);
2314         }
2315       }
2316     }
2317 
2318     if (reconfigure) {
2319       // Now write the updated fields back to the lock structure.
2320       //
2321       // Make certain that "polls" is written before "mask" !!!
2322       //
2323       // If another thread picks up the new value of mask and the old polls
2324       // pointer, it could access memory beyond the end of the old polling
2325       // area.
2326       //
2327       // On x86, we need memory fences.
2328       KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld reconfiguring "
2329                       "lock %p to %d polls\n",
2330                       ticket, lck, num_polls));
2331 
2332       lck->lk.old_polls = old_polls;
2333       lck->lk.polls = polls; // atomic store
2334 
2335       KMP_MB();
2336 
2337       lck->lk.num_polls = num_polls;
2338       lck->lk.mask = mask; // atomic store
2339 
2340       KMP_MB();
2341 
2342       // Only after the new polling area and mask have been flushed
2343       // to main memory can we update the cleanup ticket field.
2344       //
2345       // volatile load / non-volatile store
2346       lck->lk.cleanup_ticket = lck->lk.next_ticket;
2347     }
2348   }
2349   return KMP_LOCK_ACQUIRED_FIRST;
2350 }
2351 
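// Editor's note: an illustrative example of the ticket/poll indexing above
// (not part of the runtime). With num_polls == 4 the mask is 0b11, so the
// waiter holding ticket 5 spins on polls[5 & 3] == polls[1]. When the thread
// that owns ticket 4 releases the lock, it stores the value 5 into polls[1],
// which is exactly what ends that wait. Growing the polling area (mask 0b11 ->
// 0b111) spreads concurrently waiting tickets over more distinct locations,
// which is the point of the reconfiguration logic above.
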
2352 int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2353   int retval = __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
2354   return retval;
2355 }
2356 
2357 static int __kmp_acquire_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2358                                                 kmp_int32 gtid) {
2359   char const *const func = "omp_set_lock";
2360   if (lck->lk.initialized != lck) {
2361     KMP_FATAL(LockIsUninitialized, func);
2362   }
2363   if (__kmp_is_drdpa_lock_nestable(lck)) {
2364     KMP_FATAL(LockNestableUsedAsSimple, func);
2365   }
2366   if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) == gtid)) {
2367     KMP_FATAL(LockIsAlreadyOwned, func);
2368   }
2369 
2370   __kmp_acquire_drdpa_lock(lck, gtid);
2371 
2372   lck->lk.owner_id = gtid + 1;
2373   return KMP_LOCK_ACQUIRED_FIRST;
2374 }
2375 
2376 int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2377   // First get a ticket, then read the polls pointer and the mask.
2378   // The polls pointer must be read before the mask!!! (See above)
2379   kmp_uint64 ticket = lck->lk.next_ticket; // atomic load
2380   std::atomic<kmp_uint64> *polls = lck->lk.polls;
2381   kmp_uint64 mask = lck->lk.mask; // atomic load
2382   if (polls[ticket & mask] == ticket) {
2383     kmp_uint64 next_ticket = ticket + 1;
2384     if (__kmp_atomic_compare_store_acq(&lck->lk.next_ticket, ticket,
2385                                        next_ticket)) {
2386       KMP_FSYNC_ACQUIRED(lck);
2387       KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n",
2388                       ticket, lck));
2389       lck->lk.now_serving = ticket; // non-volatile store
2390 
2391       // Since no threads are waiting, there is no possibility that we would
2392       // want to reconfigure the polling area.  We might have the cleanup ticket
2393       // value (which says that it is now safe to deallocate old_polls), but
2394       // we'll let a later thread which calls __kmp_acquire_lock do that - this
2395       // routine isn't supposed to block, and we would risk blocks if we called
2396       // __kmp_free() to do the deallocation.
2397       return TRUE;
2398     }
2399   }
2400   return FALSE;
2401 }
2402 
2403 static int __kmp_test_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2404                                              kmp_int32 gtid) {
2405   char const *const func = "omp_test_lock";
2406   if (lck->lk.initialized != lck) {
2407     KMP_FATAL(LockIsUninitialized, func);
2408   }
2409   if (__kmp_is_drdpa_lock_nestable(lck)) {
2410     KMP_FATAL(LockNestableUsedAsSimple, func);
2411   }
2412 
2413   int retval = __kmp_test_drdpa_lock(lck, gtid);
2414 
2415   if (retval) {
2416     lck->lk.owner_id = gtid + 1;
2417   }
2418   return retval;
2419 }
2420 
2421 int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2422   // Read the ticket value from the lock data struct, then the polls pointer and
2423   // the mask.  The polls pointer must be read before the mask!!! (See above)
2424   kmp_uint64 ticket = lck->lk.now_serving + 1; // non-atomic load
2425   std::atomic<kmp_uint64> *polls = lck->lk.polls; // atomic load
2426   kmp_uint64 mask = lck->lk.mask; // atomic load
2427   KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
2428                   ticket - 1, lck));
2429   KMP_FSYNC_RELEASING(lck);
2430   polls[ticket & mask] = ticket; // atomic store
2431   return KMP_LOCK_RELEASED;
2432 }
2433 
2434 static int __kmp_release_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2435                                                 kmp_int32 gtid) {
2436   char const *const func = "omp_unset_lock";
2437   KMP_MB(); /* in case another processor initialized lock */
2438   if (lck->lk.initialized != lck) {
2439     KMP_FATAL(LockIsUninitialized, func);
2440   }
2441   if (__kmp_is_drdpa_lock_nestable(lck)) {
2442     KMP_FATAL(LockNestableUsedAsSimple, func);
2443   }
2444   if (__kmp_get_drdpa_lock_owner(lck) == -1) {
2445     KMP_FATAL(LockUnsettingFree, func);
2446   }
2447   if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) >= 0) &&
2448       (__kmp_get_drdpa_lock_owner(lck) != gtid)) {
2449     KMP_FATAL(LockUnsettingSetByAnother, func);
2450   }
2451   lck->lk.owner_id = 0;
2452   return __kmp_release_drdpa_lock(lck, gtid);
2453 }
2454 
2455 void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck) {
2456   lck->lk.location = NULL;
2457   lck->lk.mask = 0;
2458   lck->lk.num_polls = 1;
2459   lck->lk.polls = (std::atomic<kmp_uint64> *)__kmp_allocate(
2460       lck->lk.num_polls * sizeof(*(lck->lk.polls)));
2461   lck->lk.cleanup_ticket = 0;
2462   lck->lk.old_polls = NULL;
2463   lck->lk.next_ticket = 0;
2464   lck->lk.now_serving = 0;
2465   lck->lk.owner_id = 0; // no thread owns the lock.
2466   lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
2467   lck->lk.initialized = lck;
2468 
2469   KA_TRACE(1000, ("__kmp_init_drdpa_lock: lock %p initialized\n", lck));
2470 }
2471 
2472 void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck) {
2473   lck->lk.initialized = NULL;
2474   lck->lk.location = NULL;
2475   if (lck->lk.polls.load() != NULL) {
2476     __kmp_free(lck->lk.polls.load());
2477     lck->lk.polls = NULL;
2478   }
2479   if (lck->lk.old_polls != NULL) {
2480     __kmp_free(lck->lk.old_polls);
2481     lck->lk.old_polls = NULL;
2482   }
2483   lck->lk.mask = 0;
2484   lck->lk.num_polls = 0;
2485   lck->lk.cleanup_ticket = 0;
2486   lck->lk.next_ticket = 0;
2487   lck->lk.now_serving = 0;
2488   lck->lk.owner_id = 0;
2489   lck->lk.depth_locked = -1;
2490 }
2491 
2492 static void __kmp_destroy_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2493   char const *const func = "omp_destroy_lock";
2494   if (lck->lk.initialized != lck) {
2495     KMP_FATAL(LockIsUninitialized, func);
2496   }
2497   if (__kmp_is_drdpa_lock_nestable(lck)) {
2498     KMP_FATAL(LockNestableUsedAsSimple, func);
2499   }
2500   if (__kmp_get_drdpa_lock_owner(lck) != -1) {
2501     KMP_FATAL(LockStillOwned, func);
2502   }
2503   __kmp_destroy_drdpa_lock(lck);
2504 }
2505 
2506 // nested drdpa ticket locks
2507 
2508 int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2509   KMP_DEBUG_ASSERT(gtid >= 0);
2510 
2511   if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
2512     lck->lk.depth_locked += 1;
2513     return KMP_LOCK_ACQUIRED_NEXT;
2514   } else {
2515     __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
2516     KMP_MB();
2517     lck->lk.depth_locked = 1;
2518     KMP_MB();
2519     lck->lk.owner_id = gtid + 1;
2520     return KMP_LOCK_ACQUIRED_FIRST;
2521   }
2522 }
2523 
2524 static void __kmp_acquire_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2525                                                         kmp_int32 gtid) {
2526   char const *const func = "omp_set_nest_lock";
2527   if (lck->lk.initialized != lck) {
2528     KMP_FATAL(LockIsUninitialized, func);
2529   }
2530   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2531     KMP_FATAL(LockSimpleUsedAsNestable, func);
2532   }
2533   __kmp_acquire_nested_drdpa_lock(lck, gtid);
2534 }
2535 
2536 int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2537   int retval;
2538 
2539   KMP_DEBUG_ASSERT(gtid >= 0);
2540 
2541   if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
2542     retval = ++lck->lk.depth_locked;
2543   } else if (!__kmp_test_drdpa_lock(lck, gtid)) {
2544     retval = 0;
2545   } else {
2546     KMP_MB();
2547     retval = lck->lk.depth_locked = 1;
2548     KMP_MB();
2549     lck->lk.owner_id = gtid + 1;
2550   }
2551   return retval;
2552 }
2553 
2554 static int __kmp_test_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2555                                                     kmp_int32 gtid) {
2556   char const *const func = "omp_test_nest_lock";
2557   if (lck->lk.initialized != lck) {
2558     KMP_FATAL(LockIsUninitialized, func);
2559   }
2560   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2561     KMP_FATAL(LockSimpleUsedAsNestable, func);
2562   }
2563   return __kmp_test_nested_drdpa_lock(lck, gtid);
2564 }
2565 
2566 int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2567   KMP_DEBUG_ASSERT(gtid >= 0);
2568 
2569   KMP_MB();
2570   if (--(lck->lk.depth_locked) == 0) {
2571     KMP_MB();
2572     lck->lk.owner_id = 0;
2573     __kmp_release_drdpa_lock(lck, gtid);
2574     return KMP_LOCK_RELEASED;
2575   }
2576   return KMP_LOCK_STILL_HELD;
2577 }
2578 
2579 static int __kmp_release_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2580                                                        kmp_int32 gtid) {
2581   char const *const func = "omp_unset_nest_lock";
2582   KMP_MB(); /* in case another processor initialized lock */
2583   if (lck->lk.initialized != lck) {
2584     KMP_FATAL(LockIsUninitialized, func);
2585   }
2586   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2587     KMP_FATAL(LockSimpleUsedAsNestable, func);
2588   }
2589   if (__kmp_get_drdpa_lock_owner(lck) == -1) {
2590     KMP_FATAL(LockUnsettingFree, func);
2591   }
2592   if (__kmp_get_drdpa_lock_owner(lck) != gtid) {
2593     KMP_FATAL(LockUnsettingSetByAnother, func);
2594   }
2595   return __kmp_release_nested_drdpa_lock(lck, gtid);
2596 }
2597 
2598 void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
2599   __kmp_init_drdpa_lock(lck);
2600   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
2601 }
2602 
2603 void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
2604   __kmp_destroy_drdpa_lock(lck);
2605   lck->lk.depth_locked = 0;
2606 }
2607 
2608 static void __kmp_destroy_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2609   char const *const func = "omp_destroy_nest_lock";
2610   if (lck->lk.initialized != lck) {
2611     KMP_FATAL(LockIsUninitialized, func);
2612   }
2613   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2614     KMP_FATAL(LockSimpleUsedAsNestable, func);
2615   }
2616   if (__kmp_get_drdpa_lock_owner(lck) != -1) {
2617     KMP_FATAL(LockStillOwned, func);
2618   }
2619   __kmp_destroy_nested_drdpa_lock(lck);
2620 }
2621 
2622 // access functions to fields which don't exist for all lock kinds.
2623 
2624 static const ident_t *__kmp_get_drdpa_lock_location(kmp_drdpa_lock_t *lck) {
2625   return lck->lk.location;
2626 }
2627 
2628 static void __kmp_set_drdpa_lock_location(kmp_drdpa_lock_t *lck,
2629                                           const ident_t *loc) {
2630   lck->lk.location = loc;
2631 }
2632 
2633 static kmp_lock_flags_t __kmp_get_drdpa_lock_flags(kmp_drdpa_lock_t *lck) {
2634   return lck->lk.flags;
2635 }
2636 
2637 static void __kmp_set_drdpa_lock_flags(kmp_drdpa_lock_t *lck,
2638                                        kmp_lock_flags_t flags) {
2639   lck->lk.flags = flags;
2640 }
2641 
2642 // Time stamp counter
2643 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2644 #define __kmp_tsc() __kmp_hardware_timestamp()
2645 // Runtime's default backoff parameters
2646 kmp_backoff_t __kmp_spin_backoff_params = {1, 4096, 100};
2647 #else
2648 // Use nanoseconds for other platforms
2649 extern kmp_uint64 __kmp_now_nsec();
2650 kmp_backoff_t __kmp_spin_backoff_params = {1, 256, 100};
2651 #define __kmp_tsc() __kmp_now_nsec()
2652 #endif
2653 
2654 // A useful predicate for dealing with timestamps that may wrap.
2655 // Is a before b? Since the timestamps may wrap, this is asking whether it's
2656 // shorter to go clockwise from a to b around the clock-face, or anti-clockwise.
2657 // Times where going clockwise is less distance than going anti-clockwise
2658 // are in the future, others are in the past. e.g. a = MAX-1, b = MAX+1 (=0),
2659 // then a > b (true) does not mean a reached b; whereas signed(a) = -2,
2660 // signed(b) = 0 captures the actual difference
2661 static inline bool before(kmp_uint64 a, kmp_uint64 b) {
2662   return ((kmp_int64)b - (kmp_int64)a) > 0;
2663 }
2664 
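// Editor's note: a worked example of the wrap-safe comparison above
// (illustrative only). Take a = UINT64_MAX - 1 and b = 1, i.e. the counter has
// just wrapped:
//
//   (kmp_int64)b - (kmp_int64)a  ==  1 - (-2)  ==  3  >  0
//
// so before(a, b) is true even though the unsigned comparison a < b is false.
// The result is only meaningful while the two timestamps are within 2^63 ticks
// of each other.
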
2665 // Truncated binary exponential backoff function
2666 void __kmp_spin_backoff(kmp_backoff_t *boff) {
2667   // We could flatten this loop, but making it a nested loop gives better results
2668   kmp_uint32 i;
2669   for (i = boff->step; i > 0; i--) {
2670     kmp_uint64 goal = __kmp_tsc() + boff->min_tick;
2671 #if KMP_HAVE_UMWAIT
2672     if (__kmp_umwait_enabled) {
2673       __kmp_tpause(0, boff->min_tick);
2674     } else {
2675 #endif
2676       do {
2677         KMP_CPU_PAUSE();
2678       } while (before(__kmp_tsc(), goal));
2679 #if KMP_HAVE_UMWAIT
2680     }
2681 #endif
2682   }
2683   boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1);
2684 }
2685 
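// Editor's note: an illustrative trace of the schedule above (not part of the
// runtime). Each call spins for roughly step * min_tick time units and then
// advances step for the next call; because max_backoff is a power of two
// (4096 or 256 in the defaults above), the update
// (step << 1 | 1) & (max_backoff - 1) walks step through
//
//   1 -> 3 -> 7 -> 15 -> ... -> max_backoff - 1 -> max_backoff - 1 -> ...
//
// so the wait between successive retries grows geometrically and saturates.
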
2686 #if KMP_USE_DYNAMIC_LOCK
2687 
2688 // Direct lock initializers. It simply writes a tag to the low 8 bits of the
2689 // lock word.
2690 static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck,
2691                                    kmp_dyna_lockseq_t seq) {
2692   TCW_4(((kmp_base_tas_lock_t *)lck)->poll, KMP_GET_D_TAG(seq));
2693   KA_TRACE(
2694       20,
2695       ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq));
2696 }
2697 
2698 #if KMP_USE_TSX
2699 
2700 // HLE lock functions - imported from the testbed runtime.
2701 #define HLE_ACQUIRE ".byte 0xf2;"
2702 #define HLE_RELEASE ".byte 0xf3;"
2703 
2704 static inline kmp_uint32 swap4(kmp_uint32 volatile *p, kmp_uint32 v) {
2705   __asm__ volatile(HLE_ACQUIRE "xchg %1,%0" : "+r"(v), "+m"(*p) : : "memory");
2706   return v;
2707 }
2708 
2709 static void __kmp_destroy_hle_lock(kmp_dyna_lock_t *lck) { TCW_4(*lck, 0); }
2710 
2711 static void __kmp_destroy_hle_lock_with_checks(kmp_dyna_lock_t *lck) {
2712   TCW_4(*lck, 0);
2713 }
2714 
2715 static void __kmp_acquire_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2716   // Use gtid for KMP_LOCK_BUSY if necessary
2717   if (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)) {
2718     int delay = 1;
2719     do {
2720       while (*(kmp_uint32 volatile *)lck != KMP_LOCK_FREE(hle)) {
2721         for (int i = delay; i != 0; --i)
2722           KMP_CPU_PAUSE();
2723         delay = ((delay << 1) | 1) & 7;
2724       }
2725     } while (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle));
2726   }
2727 }
2728 
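// Editor's note: the inner pause loop above uses a tiny bounded exponential
// backoff (illustrative trace, not part of the runtime):
//
//   delay: 1 -> 3 -> 7 -> 7 -> 7 -> ...
//
// ((delay << 1) | 1) & 7 caps the pauses per probe at seven KMP_CPU_PAUSE()
// calls, keeping the wait short because an HLE retry is cheap.
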
2729 static void __kmp_acquire_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2730                                                kmp_int32 gtid) {
2731   __kmp_acquire_hle_lock(lck, gtid); // TODO: add checks
2732 }
2733 
2734 static int __kmp_release_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2735   __asm__ volatile(HLE_RELEASE "movl %1,%0"
2736                    : "=m"(*lck)
2737                    : "r"(KMP_LOCK_FREE(hle))
2738                    : "memory");
2739   return KMP_LOCK_RELEASED;
2740 }
2741 
2742 static int __kmp_release_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2743                                               kmp_int32 gtid) {
2744   return __kmp_release_hle_lock(lck, gtid); // TODO: add checks
2745 }
2746 
2747 static int __kmp_test_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2748   return swap4(lck, KMP_LOCK_BUSY(1, hle)) == KMP_LOCK_FREE(hle);
2749 }
2750 
2751 static int __kmp_test_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2752                                            kmp_int32 gtid) {
2753   return __kmp_test_hle_lock(lck, gtid); // TODO: add checks
2754 }
2755 
2756 static void __kmp_init_rtm_queuing_lock(kmp_queuing_lock_t *lck) {
2757   __kmp_init_queuing_lock(lck);
2758 }
2759 
2760 static void __kmp_destroy_rtm_queuing_lock(kmp_queuing_lock_t *lck) {
2761   __kmp_destroy_queuing_lock(lck);
2762 }
2763 
2764 static void
2765 __kmp_destroy_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
2766   __kmp_destroy_queuing_lock_with_checks(lck);
2767 }
2768 
2769 KMP_ATTRIBUTE_TARGET_RTM
2770 static void __kmp_acquire_rtm_queuing_lock(kmp_queuing_lock_t *lck,
2771                                            kmp_int32 gtid) {
2772   unsigned retries = 3, status;
2773   do {
2774     status = _xbegin();
2775     if (status == _XBEGIN_STARTED) {
2776       if (__kmp_is_unlocked_queuing_lock(lck))
2777         return;
2778       _xabort(0xff);
2779     }
2780     if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
2781       // Wait until lock becomes free
2782       while (!__kmp_is_unlocked_queuing_lock(lck)) {
2783         KMP_YIELD(TRUE);
2784       }
2785     } else if (!(status & _XABORT_RETRY))
2786       break;
2787   } while (retries--);
2788 
2789   // Fall-back non-speculative lock (xchg)
2790   __kmp_acquire_queuing_lock(lck, gtid);
2791 }
2792 
2793 static void __kmp_acquire_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
2794                                                        kmp_int32 gtid) {
2795   __kmp_acquire_rtm_queuing_lock(lck, gtid);
2796 }
2797 
2798 KMP_ATTRIBUTE_TARGET_RTM
2799 static int __kmp_release_rtm_queuing_lock(kmp_queuing_lock_t *lck,
2800                                           kmp_int32 gtid) {
2801   if (__kmp_is_unlocked_queuing_lock(lck)) {
2802     // Releasing from speculation
2803     _xend();
2804   } else {
2805     // Releasing from a real lock
2806     __kmp_release_queuing_lock(lck, gtid);
2807   }
2808   return KMP_LOCK_RELEASED;
2809 }
2810 
2811 static int __kmp_release_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
2812                                                       kmp_int32 gtid) {
2813   return __kmp_release_rtm_queuing_lock(lck, gtid);
2814 }
2815 
2816 KMP_ATTRIBUTE_TARGET_RTM
2817 static int __kmp_test_rtm_queuing_lock(kmp_queuing_lock_t *lck,
2818                                        kmp_int32 gtid) {
2819   unsigned retries = 3, status;
2820   do {
2821     status = _xbegin();
2822     if (status == _XBEGIN_STARTED && __kmp_is_unlocked_queuing_lock(lck)) {
2823       return 1;
2824     }
2825     if (!(status & _XABORT_RETRY))
2826       break;
2827   } while (retries--);
2828 
2829   return __kmp_test_queuing_lock(lck, gtid);
2830 }
2831 
2832 static int __kmp_test_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
2833                                                    kmp_int32 gtid) {
2834   return __kmp_test_rtm_queuing_lock(lck, gtid);
2835 }
2836 
2837 // Reuse kmp_tas_lock_t for TSX locks, which use RTM with a fall-back spin lock.
2838 typedef kmp_tas_lock_t kmp_rtm_spin_lock_t;
2839 
2840 static void __kmp_destroy_rtm_spin_lock(kmp_rtm_spin_lock_t *lck) {
2841   KMP_ATOMIC_ST_REL(&lck->lk.poll, 0);
2842 }
2843 
2844 static void __kmp_destroy_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck) {
2845   __kmp_destroy_rtm_spin_lock(lck);
2846 }
2847 
2848 KMP_ATTRIBUTE_TARGET_RTM
2849 static int __kmp_acquire_rtm_spin_lock(kmp_rtm_spin_lock_t *lck,
2850                                        kmp_int32 gtid) {
2851   unsigned retries = 3, status;
2852   kmp_int32 lock_free = KMP_LOCK_FREE(rtm_spin);
2853   kmp_int32 lock_busy = KMP_LOCK_BUSY(1, rtm_spin);
2854   do {
2855     status = _xbegin();
2856     if (status == _XBEGIN_STARTED) {
2857       if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free)
2858         return KMP_LOCK_ACQUIRED_FIRST;
2859       _xabort(0xff);
2860     }
2861     if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
2862       // Wait until lock becomes free
2863       while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != lock_free) {
2864         KMP_YIELD(TRUE);
2865       }
2866     } else if (!(status & _XABORT_RETRY))
2867       break;
2868   } while (retries--);
2869 
2870   // Fall-back spin lock
2871   KMP_FSYNC_PREPARE(lck);
2872   kmp_backoff_t backoff = __kmp_spin_backoff_params;
2873   while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != lock_free ||
2874          !__kmp_atomic_compare_store_acq(&lck->lk.poll, lock_free, lock_busy)) {
2875     __kmp_spin_backoff(&backoff);
2876   }
2877   KMP_FSYNC_ACQUIRED(lck);
2878   return KMP_LOCK_ACQUIRED_FIRST;
2879 }
2880 
2881 static int __kmp_acquire_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck,
2882                                                    kmp_int32 gtid) {
2883   return __kmp_acquire_rtm_spin_lock(lck, gtid);
2884 }
2885 
2886 KMP_ATTRIBUTE_TARGET_RTM
2887 static int __kmp_release_rtm_spin_lock(kmp_rtm_spin_lock_t *lck,
2888                                        kmp_int32 gtid) {
2889   if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == KMP_LOCK_FREE(rtm_spin)) {
2890     // Releasing from speculation
2891     _xend();
2892   } else {
2893     // Releasing from a real lock
2894     KMP_FSYNC_RELEASING(lck);
2895     KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(rtm_spin));
2896   }
2897   return KMP_LOCK_RELEASED;
2898 }
2899 
2900 static int __kmp_release_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck,
2901                                                    kmp_int32 gtid) {
2902   return __kmp_release_rtm_spin_lock(lck, gtid);
2903 }
2904 
2905 KMP_ATTRIBUTE_TARGET_RTM
2906 static int __kmp_test_rtm_spin_lock(kmp_rtm_spin_lock_t *lck, kmp_int32 gtid) {
2907   unsigned retries = 3, status;
2908   kmp_int32 lock_free = KMP_LOCK_FREE(rtm_spin);
2909   kmp_int32 lock_busy = KMP_LOCK_BUSY(1, rtm_spin);
2910   do {
2911     status = _xbegin();
2912     if (status == _XBEGIN_STARTED &&
2913         KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free) {
2914       return TRUE;
2915     }
2916     if (!(status & _XABORT_RETRY))
2917       break;
2918   } while (retries--);
2919 
2920   if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free &&
2921       __kmp_atomic_compare_store_acq(&lck->lk.poll, lock_free, lock_busy)) {
2922     KMP_FSYNC_ACQUIRED(lck);
2923     return TRUE;
2924   }
2925   return FALSE;
2926 }
2927 
2928 static int __kmp_test_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck,
2929                                                 kmp_int32 gtid) {
2930   return __kmp_test_rtm_spin_lock(lck, gtid);
2931 }
2932 
2933 #endif // KMP_USE_TSX
2934 
2935 // Entry functions for indirect locks (first element of direct lock jump tables)
2936 static void __kmp_init_indirect_lock(kmp_dyna_lock_t *l,
2937                                      kmp_dyna_lockseq_t tag);
2938 static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock);
2939 static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2940 static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2941 static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2942 static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2943                                                kmp_int32);
2944 static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2945                                                  kmp_int32);
2946 static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2947                                                 kmp_int32);
2948 
2949 // Lock function definitions for the union parameter type
2950 #define KMP_FOREACH_LOCK_KIND(m, a) m(ticket, a) m(queuing, a) m(drdpa, a)
2951 
2952 #define expand1(lk, op)                                                        \
2953   static void __kmp_##op##_##lk##_##lock(kmp_user_lock_p lock) {               \
2954     __kmp_##op##_##lk##_##lock(&lock->lk);                                     \
2955   }
2956 #define expand2(lk, op)                                                        \
2957   static int __kmp_##op##_##lk##_##lock(kmp_user_lock_p lock,                  \
2958                                         kmp_int32 gtid) {                      \
2959     return __kmp_##op##_##lk##_##lock(&lock->lk, gtid);                        \
2960   }
2961 #define expand3(lk, op)                                                        \
2962   static void __kmp_set_##lk##_##lock_flags(kmp_user_lock_p lock,              \
2963                                             kmp_lock_flags_t flags) {          \
2964     __kmp_set_##lk##_lock_flags(&lock->lk, flags);                             \
2965   }
2966 #define expand4(lk, op)                                                        \
2967   static void __kmp_set_##lk##_##lock_location(kmp_user_lock_p lock,           \
2968                                                const ident_t *loc) {           \
2969     __kmp_set_##lk##_lock_location(&lock->lk, loc);                            \
2970   }
2971 
2972 KMP_FOREACH_LOCK_KIND(expand1, init)
2973 KMP_FOREACH_LOCK_KIND(expand1, init_nested)
2974 KMP_FOREACH_LOCK_KIND(expand1, destroy)
2975 KMP_FOREACH_LOCK_KIND(expand1, destroy_nested)
2976 KMP_FOREACH_LOCK_KIND(expand2, acquire)
2977 KMP_FOREACH_LOCK_KIND(expand2, acquire_nested)
2978 KMP_FOREACH_LOCK_KIND(expand2, release)
2979 KMP_FOREACH_LOCK_KIND(expand2, release_nested)
2980 KMP_FOREACH_LOCK_KIND(expand2, test)
2981 KMP_FOREACH_LOCK_KIND(expand2, test_nested)
2982 KMP_FOREACH_LOCK_KIND(expand3, )
2983 KMP_FOREACH_LOCK_KIND(expand4, )
2984 
2985 #undef expand1
2986 #undef expand2
2987 #undef expand3
2988 #undef expand4
2989 
2990 // Jump tables for the direct lock functions
2991 // Only fill in the odd entries; that avoids the need to shift out the low bit
2992 
2993 // init functions
2994 #define expand(l, op) 0, __kmp_init_direct_lock,
2995 void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t) = {
2996     __kmp_init_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, init)};
2997 #undef expand
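
// Illustrative sketch (not compiled, not upstream code): only the odd entries
// are populated because a direct lock keeps an odd tag in its lock word, so
// the tag can index these tables as-is, with no shift, while an (even)
// indirect lock word resolves to slot 0, the indirect entry function. The
// macros named in the fragment are assumptions taken from kmp_lock.h.
#if 0
// Fragment only: 'lock' points at a lock whose first word holds either an odd
// direct-lock tag or an even indirect index. KMP_EXTRACT_D_TAG() is assumed
// to mask out the tag and yield 0 for even words, so the same expression
// lands on a direct-lock entry of the set table defined below, or on slot 0.
int rc = __kmp_direct_set[KMP_EXTRACT_D_TAG(lock)](lock, gtid);
#endif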
2998 
2999 // destroy functions
3000 #define expand(l, op) 0, (void (*)(kmp_dyna_lock_t *))__kmp_##op##_##l##_lock,
3001 static void (*direct_destroy[])(kmp_dyna_lock_t *) = {
3002     __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)};
3003 #undef expand
3004 #define expand(l, op)                                                          \
3005   0, (void (*)(kmp_dyna_lock_t *))__kmp_destroy_##l##_lock_with_checks,
3006 static void (*direct_destroy_check[])(kmp_dyna_lock_t *) = {
3007     __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)};
3008 #undef expand
3009 
3010 // set/acquire functions
3011 #define expand(l, op)                                                          \
3012   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
3013 static int (*direct_set[])(kmp_dyna_lock_t *, kmp_int32) = {
3014     __kmp_set_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, acquire)};
3015 #undef expand
3016 #define expand(l, op)                                                          \
3017   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
3018 static int (*direct_set_check[])(kmp_dyna_lock_t *, kmp_int32) = {
3019     __kmp_set_indirect_lock_with_checks, 0,
3020     KMP_FOREACH_D_LOCK(expand, acquire)};
3021 #undef expand
3022 
3023 // unset/release and test functions
3024 #define expand(l, op)                                                          \
3025   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
3026 static int (*direct_unset[])(kmp_dyna_lock_t *, kmp_int32) = {
3027     __kmp_unset_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, release)};
3028 static int (*direct_test[])(kmp_dyna_lock_t *, kmp_int32) = {
3029     __kmp_test_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, test)};
3030 #undef expand
3031 #define expand(l, op)                                                          \
3032   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
3033 static int (*direct_unset_check[])(kmp_dyna_lock_t *, kmp_int32) = {
3034     __kmp_unset_indirect_lock_with_checks, 0,
3035     KMP_FOREACH_D_LOCK(expand, release)};
3036 static int (*direct_test_check[])(kmp_dyna_lock_t *, kmp_int32) = {
3037     __kmp_test_indirect_lock_with_checks, 0, KMP_FOREACH_D_LOCK(expand, test)};
3038 #undef expand
3039 
3040 // Exposes only one set of jump tables (*lock or *lock_with_checks).
3041 void (**__kmp_direct_destroy)(kmp_dyna_lock_t *) = 0;
3042 int (**__kmp_direct_set)(kmp_dyna_lock_t *, kmp_int32) = 0;
3043 int (**__kmp_direct_unset)(kmp_dyna_lock_t *, kmp_int32) = 0;
3044 int (**__kmp_direct_test)(kmp_dyna_lock_t *, kmp_int32) = 0;
3045 
3046 // Jump tables for the indirect lock functions
3047 #define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock,
3048 void (*__kmp_indirect_init[])(kmp_user_lock_p) = {
3049     KMP_FOREACH_I_LOCK(expand, init)};
3050 #undef expand
3051 
3052 #define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock,
3053 static void (*indirect_destroy[])(kmp_user_lock_p) = {
3054     KMP_FOREACH_I_LOCK(expand, destroy)};
3055 #undef expand
3056 #define expand(l, op)                                                          \
3057   (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock_with_checks,
3058 static void (*indirect_destroy_check[])(kmp_user_lock_p) = {
3059     KMP_FOREACH_I_LOCK(expand, destroy)};
3060 #undef expand
3061 
3062 // set/acquire functions
3063 #define expand(l, op)                                                          \
3064   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
3065 static int (*indirect_set[])(kmp_user_lock_p,
3066                              kmp_int32) = {KMP_FOREACH_I_LOCK(expand, acquire)};
3067 #undef expand
3068 #define expand(l, op)                                                          \
3069   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
3070 static int (*indirect_set_check[])(kmp_user_lock_p, kmp_int32) = {
3071     KMP_FOREACH_I_LOCK(expand, acquire)};
3072 #undef expand
3073 
3074 // unset/release and test functions
3075 #define expand(l, op)                                                          \
3076   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
3077 static int (*indirect_unset[])(kmp_user_lock_p, kmp_int32) = {
3078     KMP_FOREACH_I_LOCK(expand, release)};
3079 static int (*indirect_test[])(kmp_user_lock_p,
3080                               kmp_int32) = {KMP_FOREACH_I_LOCK(expand, test)};
3081 #undef expand
3082 #define expand(l, op)                                                          \
3083   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
3084 static int (*indirect_unset_check[])(kmp_user_lock_p, kmp_int32) = {
3085     KMP_FOREACH_I_LOCK(expand, release)};
3086 static int (*indirect_test_check[])(kmp_user_lock_p, kmp_int32) = {
3087     KMP_FOREACH_I_LOCK(expand, test)};
3088 #undef expand
3089 
3090 // Exposes only one set of jump tables (*lock or *lock_with_checks).
3091 void (**__kmp_indirect_destroy)(kmp_user_lock_p) = 0;
3092 int (**__kmp_indirect_set)(kmp_user_lock_p, kmp_int32) = 0;
3093 int (**__kmp_indirect_unset)(kmp_user_lock_p, kmp_int32) = 0;
3094 int (**__kmp_indirect_test)(kmp_user_lock_p, kmp_int32) = 0;
3095 
3096 // Lock index table.
3097 kmp_indirect_lock_table_t __kmp_i_lock_table;
3098 
3099 // Size of indirect locks.
3100 static kmp_uint32 __kmp_indirect_lock_size[KMP_NUM_I_LOCKS] = {0};
3101 
3102 // Jump tables for lock accessor/modifier.
3103 void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
3104                                                      const ident_t *) = {0};
3105 void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
3106                                                   kmp_lock_flags_t) = {0};
3107 const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])(
3108     kmp_user_lock_p) = {0};
3109 kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])(
3110     kmp_user_lock_p) = {0};
3111 
3112 // Use different lock pools for different lock types.
3113 static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0};
3114 
3115 // User lock allocator for dynamically dispatched indirect locks. Every entry
3116 // of the indirect lock table holds the address and type of the allocated
3117 // indirect lock (kmp_indirect_lock_t); when a table fills up, a new table with
3118 // double the capacity is chained to it. A destroyed indirect lock object is
3119 // returned to a reusable pool of locks that is unique to each lock type.
3120 kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
3121                                                   kmp_int32 gtid,
3122                                                   kmp_indirect_locktag_t tag) {
3123   kmp_indirect_lock_t *lck;
3124   kmp_lock_index_t idx, table_idx;
3125 
3126   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3127 
3128   if (__kmp_indirect_lock_pool[tag] != NULL) {
3129     // Reuse the allocated and destroyed lock object
3130     lck = __kmp_indirect_lock_pool[tag];
3131     if (OMP_LOCK_T_SIZE < sizeof(void *))
3132       idx = lck->lock->pool.index;
3133     __kmp_indirect_lock_pool[tag] = (kmp_indirect_lock_t *)lck->lock->pool.next;
3134     KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n",
3135                   lck));
3136   } else {
3137     kmp_uint32 row, col;
3138     kmp_indirect_lock_table_t *lock_table = &__kmp_i_lock_table;
3139     idx = 0;
3140     // Find a location in the list of lock tables to put the new lock
3141     while (1) {
3142       table_idx = lock_table->next; // index within this table
3143       idx += lock_table->next; // global index within list of tables
3144       if (table_idx < lock_table->nrow_ptrs * KMP_I_LOCK_CHUNK) {
3145         row = table_idx / KMP_I_LOCK_CHUNK;
3146         col = table_idx % KMP_I_LOCK_CHUNK;
3147         // Allocate a new row of locks if necessary
3148         if (!lock_table->table[row]) {
3149           lock_table->table[row] = (kmp_indirect_lock_t *)__kmp_allocate(
3150               sizeof(kmp_indirect_lock_t) * KMP_I_LOCK_CHUNK);
3151         }
3152         break;
3153       }
3154       // If necessary, allocate a new lock table with double the capacity
3155       if (!lock_table->next_table) {
3156         kmp_indirect_lock_table_t *next_table =
3157             (kmp_indirect_lock_table_t *)__kmp_allocate(
3158                 sizeof(kmp_indirect_lock_table_t));
3159         next_table->table = (kmp_indirect_lock_t **)__kmp_allocate(
3160             sizeof(kmp_indirect_lock_t *) * 2 * lock_table->nrow_ptrs);
3161         next_table->nrow_ptrs = 2 * lock_table->nrow_ptrs;
3162         next_table->next = 0;
3163         next_table->next_table = nullptr;
3164         lock_table->next_table = next_table;
3165       }
3166       lock_table = lock_table->next_table;
3167       KMP_ASSERT(lock_table);
3168     }
3169     lock_table->next++;
3170 
3171     lck = &lock_table->table[row][col];
3172     // Allocate a new base lock object
3173     lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]);
3174     KA_TRACE(20,
3175              ("__kmp_allocate_indirect_lock: allocated a new lock %p\n", lck));
3176   }
3177 
3178   __kmp_release_lock(&__kmp_global_lock, gtid);
3179 
3180   lck->type = tag;
3181 
3182   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3183     *(kmp_lock_index_t *)&(((kmp_base_tas_lock_t *)user_lock)->poll) =
3184         idx << 1; // indirect lock word must be even
3185   } else {
3186     *((kmp_indirect_lock_t **)user_lock) = lck;
3187   }
3188 
3189   return lck;
3190 }
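
// Illustrative sketch (not compiled, not upstream code): how a global lock
// index produced above maps back onto the chained tables. Each table holds
// nrow_ptrs rows of KMP_I_LOCK_CHUNK locks and each successor table doubles
// nrow_ptrs, so a lookup subtracts each table's capacity until the index
// falls inside one of them. This mirrors what __kmp_get_i_lock() /
// KMP_LOOKUP_I_LOCK (kmp_lock.h) are assumed to do.
#if 0
static kmp_indirect_lock_t *example_index_to_lock(kmp_lock_index_t idx) {
  kmp_indirect_lock_table_t *t = &__kmp_i_lock_table;
  while (idx >= t->nrow_ptrs * KMP_I_LOCK_CHUNK) {
    idx -= t->nrow_ptrs * KMP_I_LOCK_CHUNK;
    t = t->next_table; // each successor table has twice the row pointers
  }
  return &t->table[idx / KMP_I_LOCK_CHUNK][idx % KMP_I_LOCK_CHUNK];
}
#endif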
3191 
3192 // User lock lookup for dynamically dispatched locks.
3193 static __forceinline kmp_indirect_lock_t *
3194 __kmp_lookup_indirect_lock(void **user_lock, const char *func) {
3195   if (__kmp_env_consistency_check) {
3196     kmp_indirect_lock_t *lck = NULL;
3197     if (user_lock == NULL) {
3198       KMP_FATAL(LockIsUninitialized, func);
3199     }
3200     if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3201       kmp_lock_index_t idx = KMP_EXTRACT_I_INDEX(user_lock);
3202       lck = __kmp_get_i_lock(idx);
3203     } else {
3204       lck = *((kmp_indirect_lock_t **)user_lock);
3205     }
3206     if (lck == NULL) {
3207       KMP_FATAL(LockIsUninitialized, func);
3208     }
3209     return lck;
3210   } else {
3211     if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3212       return __kmp_get_i_lock(KMP_EXTRACT_I_INDEX(user_lock));
3213     } else {
3214       return *((kmp_indirect_lock_t **)user_lock);
3215     }
3216   }
3217 }
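
// Illustrative sketch (not compiled, not upstream code): what the user-visible
// lock word encodes when OMP_LOCK_T_SIZE < sizeof(void *). An indirect lock
// stores its table index shifted left by one (always even, see
// __kmp_allocate_indirect_lock above), while a direct lock keeps an odd tag
// in its low bits, so the low bit alone distinguishes the two paths.
// KMP_EXTRACT_I_INDEX() (kmp_lock.h) is assumed to simply undo that shift.
#if 0
static bool example_is_indirect_word(void **user_lock) {
  kmp_lock_index_t word = *(kmp_lock_index_t *)user_lock;
  return (word & 1) == 0; // even => indirect lock, word >> 1 is the index
}
#endif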
3218 
3219 static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock,
3220                                      kmp_dyna_lockseq_t seq) {
3221 #if KMP_USE_ADAPTIVE_LOCKS
3222   if (seq == lockseq_adaptive && !__kmp_cpuinfo.flags.rtm) {
3223     KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive");
3224     seq = lockseq_queuing;
3225   }
3226 #endif
3227 #if KMP_USE_TSX
3228   if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.flags.rtm) {
3229     seq = lockseq_queuing;
3230   }
3231 #endif
3232   kmp_indirect_locktag_t tag = KMP_GET_I_TAG(seq);
3233   kmp_indirect_lock_t *l =
3234       __kmp_allocate_indirect_lock((void **)lock, __kmp_entry_gtid(), tag);
3235   KMP_I_LOCK_FUNC(l, init)(l->lock);
3236   KA_TRACE(
3237       20, ("__kmp_init_indirect_lock: initialized indirect lock with type#%d\n",
3238            seq));
3239 }
3240 
3241 static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock) {
3242   kmp_uint32 gtid = __kmp_entry_gtid();
3243   kmp_indirect_lock_t *l =
3244       __kmp_lookup_indirect_lock((void **)lock, "omp_destroy_lock");
3245   KMP_I_LOCK_FUNC(l, destroy)(l->lock);
3246   kmp_indirect_locktag_t tag = l->type;
3247 
3248   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3249 
3250   // Use the base lock's space to keep the pool chain.
3251   l->lock->pool.next = (kmp_user_lock_p)__kmp_indirect_lock_pool[tag];
3252   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3253     l->lock->pool.index = KMP_EXTRACT_I_INDEX(lock);
3254   }
3255   __kmp_indirect_lock_pool[tag] = l;
3256 
3257   __kmp_release_lock(&__kmp_global_lock, gtid);
3258 }
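
// Illustrative sketch (not compiled, not upstream code): destroy and allocate
// form a per-tag free list. Destroy pushes the base lock onto
// __kmp_indirect_lock_pool[tag], reusing the base lock's own storage for the
// links, and the next __kmp_allocate_indirect_lock() call with the same tag
// pops it off again.
#if 0
  // Push, as done just above in __kmp_destroy_indirect_lock():
  l->lock->pool.next = (kmp_user_lock_p)__kmp_indirect_lock_pool[tag];
  __kmp_indirect_lock_pool[tag] = l;
  // Pop, as done in __kmp_allocate_indirect_lock():
  lck = __kmp_indirect_lock_pool[tag];
  __kmp_indirect_lock_pool[tag] = (kmp_indirect_lock_t *)lck->lock->pool.next;
#endif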
3259 
3260 static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
3261   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
3262   return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
3263 }
3264 
3265 static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
3266   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
3267   return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
3268 }
3269 
3270 static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
3271   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
3272   return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
3273 }
3274 
3275 static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
3276                                                kmp_int32 gtid) {
3277   kmp_indirect_lock_t *l =
3278       __kmp_lookup_indirect_lock((void **)lock, "omp_set_lock");
3279   return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
3280 }
3281 
3282 static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
3283                                                  kmp_int32 gtid) {
3284   kmp_indirect_lock_t *l =
3285       __kmp_lookup_indirect_lock((void **)lock, "omp_unset_lock");
3286   return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
3287 }
3288 
3289 static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
3290                                                 kmp_int32 gtid) {
3291   kmp_indirect_lock_t *l =
3292       __kmp_lookup_indirect_lock((void **)lock, "omp_test_lock");
3293   return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
3294 }
3295 
3296 kmp_dyna_lockseq_t __kmp_user_lock_seq = lockseq_queuing;
3297 
3298 // This is used only in kmp_error.cpp when consistency checking is on.
3299 kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck, kmp_uint32 seq) {
3300   switch (seq) {
3301   case lockseq_tas:
3302   case lockseq_nested_tas:
3303     return __kmp_get_tas_lock_owner((kmp_tas_lock_t *)lck);
3304 #if KMP_USE_FUTEX
3305   case lockseq_futex:
3306   case lockseq_nested_futex:
3307     return __kmp_get_futex_lock_owner((kmp_futex_lock_t *)lck);
3308 #endif
3309   case lockseq_ticket:
3310   case lockseq_nested_ticket:
3311     return __kmp_get_ticket_lock_owner((kmp_ticket_lock_t *)lck);
3312   case lockseq_queuing:
3313   case lockseq_nested_queuing:
3314 #if KMP_USE_ADAPTIVE_LOCKS
3315   case lockseq_adaptive:
3316 #endif
3317     return __kmp_get_queuing_lock_owner((kmp_queuing_lock_t *)lck);
3318   case lockseq_drdpa:
3319   case lockseq_nested_drdpa:
3320     return __kmp_get_drdpa_lock_owner((kmp_drdpa_lock_t *)lck);
3321   default:
3322     return 0;
3323   }
3324 }
3325 
3326 // Initializes data for dynamic user locks.
3327 void __kmp_init_dynamic_user_locks() {
3328   // Initialize jump table for the lock functions
3329   if (__kmp_env_consistency_check) {
3330     __kmp_direct_set = direct_set_check;
3331     __kmp_direct_unset = direct_unset_check;
3332     __kmp_direct_test = direct_test_check;
3333     __kmp_direct_destroy = direct_destroy_check;
3334     __kmp_indirect_set = indirect_set_check;
3335     __kmp_indirect_unset = indirect_unset_check;
3336     __kmp_indirect_test = indirect_test_check;
3337     __kmp_indirect_destroy = indirect_destroy_check;
3338   } else {
3339     __kmp_direct_set = direct_set;
3340     __kmp_direct_unset = direct_unset;
3341     __kmp_direct_test = direct_test;
3342     __kmp_direct_destroy = direct_destroy;
3343     __kmp_indirect_set = indirect_set;
3344     __kmp_indirect_unset = indirect_unset;
3345     __kmp_indirect_test = indirect_test;
3346     __kmp_indirect_destroy = indirect_destroy;
3347   }
3348   // If the user locks have already been initialized, then return. Allow the
3349   // switch between different KMP_CONSISTENCY_CHECK values, but do not allocate
3350   // new lock tables if they have already been allocated.
3351   if (__kmp_init_user_locks)
3352     return;
3353 
3354   // Initialize lock index table
3355   __kmp_i_lock_table.nrow_ptrs = KMP_I_LOCK_TABLE_INIT_NROW_PTRS;
3356   __kmp_i_lock_table.table = (kmp_indirect_lock_t **)__kmp_allocate(
3357       sizeof(kmp_indirect_lock_t *) * KMP_I_LOCK_TABLE_INIT_NROW_PTRS);
3358   *(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate(
3359       KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
3360   __kmp_i_lock_table.next = 0;
3361   __kmp_i_lock_table.next_table = nullptr;
3362 
3363   // Indirect lock size
3364   __kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t);
3365   __kmp_indirect_lock_size[locktag_queuing] = sizeof(kmp_queuing_lock_t);
3366 #if KMP_USE_ADAPTIVE_LOCKS
3367   __kmp_indirect_lock_size[locktag_adaptive] = sizeof(kmp_adaptive_lock_t);
3368 #endif
3369   __kmp_indirect_lock_size[locktag_drdpa] = sizeof(kmp_drdpa_lock_t);
3370 #if KMP_USE_TSX
3371   __kmp_indirect_lock_size[locktag_rtm_queuing] = sizeof(kmp_queuing_lock_t);
3372 #endif
3373   __kmp_indirect_lock_size[locktag_nested_tas] = sizeof(kmp_tas_lock_t);
3374 #if KMP_USE_FUTEX
3375   __kmp_indirect_lock_size[locktag_nested_futex] = sizeof(kmp_futex_lock_t);
3376 #endif
3377   __kmp_indirect_lock_size[locktag_nested_ticket] = sizeof(kmp_ticket_lock_t);
3378   __kmp_indirect_lock_size[locktag_nested_queuing] = sizeof(kmp_queuing_lock_t);
3379   __kmp_indirect_lock_size[locktag_nested_drdpa] = sizeof(kmp_drdpa_lock_t);
3380 
3381 // Initialize lock accessor/modifier
3382 #define fill_jumps(table, expand, sep)                                         \
3383   {                                                                            \
3384     table[locktag##sep##ticket] = expand(ticket);                              \
3385     table[locktag##sep##queuing] = expand(queuing);                            \
3386     table[locktag##sep##drdpa] = expand(drdpa);                                \
3387   }
3388 
3389 #if KMP_USE_ADAPTIVE_LOCKS
3390 #define fill_table(table, expand)                                              \
3391   {                                                                            \
3392     fill_jumps(table, expand, _);                                              \
3393     table[locktag_adaptive] = expand(queuing);                                 \
3394     fill_jumps(table, expand, _nested_);                                       \
3395   }
3396 #else
3397 #define fill_table(table, expand)                                              \
3398   {                                                                            \
3399     fill_jumps(table, expand, _);                                              \
3400     fill_jumps(table, expand, _nested_);                                       \
3401   }
3402 #endif // KMP_USE_ADAPTIVE_LOCKS
3403 
3404 #define expand(l)                                                              \
3405   (void (*)(kmp_user_lock_p, const ident_t *)) __kmp_set_##l##_lock_location
3406   fill_table(__kmp_indirect_set_location, expand);
3407 #undef expand
3408 #define expand(l)                                                              \
3409   (void (*)(kmp_user_lock_p, kmp_lock_flags_t)) __kmp_set_##l##_lock_flags
3410   fill_table(__kmp_indirect_set_flags, expand);
3411 #undef expand
3412 #define expand(l)                                                              \
3413   (const ident_t *(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_location
3414   fill_table(__kmp_indirect_get_location, expand);
3415 #undef expand
3416 #define expand(l)                                                              \
3417   (kmp_lock_flags_t(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_flags
3418   fill_table(__kmp_indirect_get_flags, expand);
3419 #undef expand
3420 
3421   __kmp_init_user_locks = TRUE;
3422 }
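
// Illustrative sketch (not compiled, not upstream code): the table selection
// above is the only point where consistency checking changes behavior for
// dynamic locks. The exported pointers alias either the plain tables or the
// *_check tables, and every dispatch simply indexes through them.
#if 0
  // Fragment only: 'lock' is a kmp_dyna_lock_t *, 'gtid' the caller's global
  // thread id; slot 0 is the indirect-lock entry function.
  int rc = __kmp_direct_set[0](lock, gtid);
#endif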
3423 
3424 // Clean up the lock table.
3425 void __kmp_cleanup_indirect_user_locks() {
3426   int k;
3427 
3428   // Clean up locks in the pools first (they were already destroyed before going
3429   // into the pools).
3430   for (k = 0; k < KMP_NUM_I_LOCKS; ++k) {
3431     kmp_indirect_lock_t *l = __kmp_indirect_lock_pool[k];
3432     while (l != NULL) {
3433       kmp_indirect_lock_t *ll = l;
3434       l = (kmp_indirect_lock_t *)l->lock->pool.next;
3435       KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: freeing %p from pool\n",
3436                     ll));
3437       __kmp_free(ll->lock);
3438       ll->lock = NULL;
3439     }
3440     __kmp_indirect_lock_pool[k] = NULL;
3441   }
3442   // Clean up the remaining undestroyed locks.
3443   kmp_indirect_lock_table_t *ptr = &__kmp_i_lock_table;
3444   while (ptr) {
3445     for (kmp_uint32 row = 0; row < ptr->nrow_ptrs; ++row) {
3446       if (!ptr->table[row])
3447         continue;
3448       for (kmp_uint32 col = 0; col < KMP_I_LOCK_CHUNK; ++col) {
3449         kmp_indirect_lock_t *l = &ptr->table[row][col];
3450         if (l->lock) {
3451           // Locks not destroyed explicitly need to be destroyed here.
3452           KMP_I_LOCK_FUNC(l, destroy)(l->lock);
3453           KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p "
3454                         "from table\n",
3455                         l));
3456           __kmp_free(l->lock);
3457         }
3458       }
3459       __kmp_free(ptr->table[row]);
3460     }
3461     kmp_indirect_lock_table_t *next_table = ptr->next_table;
3462     if (ptr != &__kmp_i_lock_table)
3463       __kmp_free(ptr);
3464     ptr = next_table;
3465   }
3466 
3467   __kmp_init_user_locks = FALSE;
3468 }
3469 
3470 enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
3471 int __kmp_num_locks_in_block = 1; // FIXME - tune this value
3472 
3473 #else // KMP_USE_DYNAMIC_LOCK
3474 
3475 static void __kmp_init_tas_lock_with_checks(kmp_tas_lock_t *lck) {
3476   __kmp_init_tas_lock(lck);
3477 }
3478 
3479 static void __kmp_init_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
3480   __kmp_init_nested_tas_lock(lck);
3481 }
3482 
3483 #if KMP_USE_FUTEX
3484 static void __kmp_init_futex_lock_with_checks(kmp_futex_lock_t *lck) {
3485   __kmp_init_futex_lock(lck);
3486 }
3487 
3488 static void __kmp_init_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
3489   __kmp_init_nested_futex_lock(lck);
3490 }
3491 #endif
3492 
3493 static int __kmp_is_ticket_lock_initialized(kmp_ticket_lock_t *lck) {
3494   return lck == lck->lk.self;
3495 }
3496 
3497 static void __kmp_init_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
3498   __kmp_init_ticket_lock(lck);
3499 }
3500 
3501 static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
3502   __kmp_init_nested_ticket_lock(lck);
3503 }
3504 
3505 static int __kmp_is_queuing_lock_initialized(kmp_queuing_lock_t *lck) {
3506   return lck == lck->lk.initialized;
3507 }
3508 
3509 static void __kmp_init_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
3510   __kmp_init_queuing_lock(lck);
3511 }
3512 
3513 static void
3514 __kmp_init_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
3515   __kmp_init_nested_queuing_lock(lck);
3516 }
3517 
3518 #if KMP_USE_ADAPTIVE_LOCKS
3519 static void __kmp_init_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
3520   __kmp_init_adaptive_lock(lck);
3521 }
3522 #endif
3523 
3524 static int __kmp_is_drdpa_lock_initialized(kmp_drdpa_lock_t *lck) {
3525   return lck == lck->lk.initialized;
3526 }
3527 
3528 static void __kmp_init_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
3529   __kmp_init_drdpa_lock(lck);
3530 }
3531 
3532 static void __kmp_init_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
3533   __kmp_init_nested_drdpa_lock(lck);
3534 }
3535 
3536 /* User locks
3537  * They are implemented as a table of function pointers that are set to the
3538  * lock functions of the appropriate kind once that kind has been determined. */
3539 
3540 enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
3541 
3542 size_t __kmp_base_user_lock_size = 0;
3543 size_t __kmp_user_lock_size = 0;
3544 
3545 kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck) = NULL;
3546 int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
3547                                             kmp_int32 gtid) = NULL;
3548 
3549 int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck,
3550                                          kmp_int32 gtid) = NULL;
3551 int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck,
3552                                             kmp_int32 gtid) = NULL;
3553 void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3554 void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck) = NULL;
3555 void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3556 int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
3557                                                    kmp_int32 gtid) = NULL;
3558 
3559 int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
3560                                                 kmp_int32 gtid) = NULL;
3561 int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
3562                                                    kmp_int32 gtid) = NULL;
3563 void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3564 void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3565 
3566 int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck) = NULL;
3567 const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck) = NULL;
3568 void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck,
3569                                       const ident_t *loc) = NULL;
3570 kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck) = NULL;
3571 void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck,
3572                                    kmp_lock_flags_t flags) = NULL;
3573 
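// Illustrative sketch (not compiled, not upstream code): once
// __kmp_set_user_lock_vptrs() below has bound these pointers for the chosen
// lock kind, callers dispatch through them without knowing the concrete lock
// type. A hypothetical driver:
#if 0
static void example_user_lock_roundtrip(kmp_user_lock_p lck, kmp_int32 gtid) {
  (*__kmp_init_user_lock_with_checks_)(lck);
  (*__kmp_acquire_user_lock_with_checks_)(lck, gtid);
  (*__kmp_release_user_lock_with_checks_)(lck, gtid);
  (*__kmp_destroy_user_lock_with_checks_)(lck);
}
#endif
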
3574 void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind) {
3575   switch (user_lock_kind) {
3576   case lk_default:
3577   default:
3578     KMP_ASSERT(0);
3579 
3580   case lk_tas: {
3581     __kmp_base_user_lock_size = sizeof(kmp_base_tas_lock_t);
3582     __kmp_user_lock_size = sizeof(kmp_tas_lock_t);
3583 
3584     __kmp_get_user_lock_owner_ =
3585         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_tas_lock_owner);
3586 
3587     if (__kmp_env_consistency_check) {
3588       KMP_BIND_USER_LOCK_WITH_CHECKS(tas);
3589       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(tas);
3590     } else {
3591       KMP_BIND_USER_LOCK(tas);
3592       KMP_BIND_NESTED_USER_LOCK(tas);
3593     }
3594 
3595     __kmp_destroy_user_lock_ =
3596         (void (*)(kmp_user_lock_p))(&__kmp_destroy_tas_lock);
3597 
3598     __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;
3599 
3600     __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;
3601 
3602     __kmp_set_user_lock_location_ =
3603         (void (*)(kmp_user_lock_p, const ident_t *))NULL;
3604 
3605     __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;
3606 
3607     __kmp_set_user_lock_flags_ =
3608         (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
3609   } break;
3610 
3611 #if KMP_USE_FUTEX
3612 
3613   case lk_futex: {
3614     __kmp_base_user_lock_size = sizeof(kmp_base_futex_lock_t);
3615     __kmp_user_lock_size = sizeof(kmp_futex_lock_t);
3616 
3617     __kmp_get_user_lock_owner_ =
3618         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_futex_lock_owner);
3619 
3620     if (__kmp_env_consistency_check) {
3621       KMP_BIND_USER_LOCK_WITH_CHECKS(futex);
3622       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(futex);
3623     } else {
3624       KMP_BIND_USER_LOCK(futex);
3625       KMP_BIND_NESTED_USER_LOCK(futex);
3626     }
3627 
3628     __kmp_destroy_user_lock_ =
3629         (void (*)(kmp_user_lock_p))(&__kmp_destroy_futex_lock);
3630 
3631     __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;
3632 
3633     __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;
3634 
3635     __kmp_set_user_lock_location_ =
3636         (void (*)(kmp_user_lock_p, const ident_t *))NULL;
3637 
3638     __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;
3639 
3640     __kmp_set_user_lock_flags_ =
3641         (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
3642   } break;
3643 
3644 #endif // KMP_USE_FUTEX
3645 
3646   case lk_ticket: {
3647     __kmp_base_user_lock_size = sizeof(kmp_base_ticket_lock_t);
3648     __kmp_user_lock_size = sizeof(kmp_ticket_lock_t);
3649 
3650     __kmp_get_user_lock_owner_ =
3651         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_owner);
3652 
3653     if (__kmp_env_consistency_check) {
3654       KMP_BIND_USER_LOCK_WITH_CHECKS(ticket);
3655       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(ticket);
3656     } else {
3657       KMP_BIND_USER_LOCK(ticket);
3658       KMP_BIND_NESTED_USER_LOCK(ticket);
3659     }
3660 
3661     __kmp_destroy_user_lock_ =
3662         (void (*)(kmp_user_lock_p))(&__kmp_destroy_ticket_lock);
3663 
3664     __kmp_is_user_lock_initialized_ =
3665         (int (*)(kmp_user_lock_p))(&__kmp_is_ticket_lock_initialized);
3666 
3667     __kmp_get_user_lock_location_ =
3668         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_location);
3669 
3670     __kmp_set_user_lock_location_ = (void (*)(
3671         kmp_user_lock_p, const ident_t *))(&__kmp_set_ticket_lock_location);
3672 
3673     __kmp_get_user_lock_flags_ =
3674         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_flags);
3675 
3676     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3677         &__kmp_set_ticket_lock_flags);
3678   } break;
3679 
3680   case lk_queuing: {
3681     __kmp_base_user_lock_size = sizeof(kmp_base_queuing_lock_t);
3682     __kmp_user_lock_size = sizeof(kmp_queuing_lock_t);
3683 
3684     __kmp_get_user_lock_owner_ =
3685         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);
3686 
3687     if (__kmp_env_consistency_check) {
3688       KMP_BIND_USER_LOCK_WITH_CHECKS(queuing);
3689       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(queuing);
3690     } else {
3691       KMP_BIND_USER_LOCK(queuing);
3692       KMP_BIND_NESTED_USER_LOCK(queuing);
3693     }
3694 
3695     __kmp_destroy_user_lock_ =
3696         (void (*)(kmp_user_lock_p))(&__kmp_destroy_queuing_lock);
3697 
3698     __kmp_is_user_lock_initialized_ =
3699         (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);
3700 
3701     __kmp_get_user_lock_location_ =
3702         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);
3703 
3704     __kmp_set_user_lock_location_ = (void (*)(
3705         kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);
3706 
3707     __kmp_get_user_lock_flags_ =
3708         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);
3709 
3710     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3711         &__kmp_set_queuing_lock_flags);
3712   } break;
3713 
3714 #if KMP_USE_ADAPTIVE_LOCKS
3715   case lk_adaptive: {
3716     __kmp_base_user_lock_size = sizeof(kmp_base_adaptive_lock_t);
3717     __kmp_user_lock_size = sizeof(kmp_adaptive_lock_t);
3718 
3719     __kmp_get_user_lock_owner_ =
3720         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);
3721 
3722     if (__kmp_env_consistency_check) {
3723       KMP_BIND_USER_LOCK_WITH_CHECKS(adaptive);
3724     } else {
3725       KMP_BIND_USER_LOCK(adaptive);
3726     }
3727 
3728     __kmp_destroy_user_lock_ =
3729         (void (*)(kmp_user_lock_p))(&__kmp_destroy_adaptive_lock);
3730 
3731     __kmp_is_user_lock_initialized_ =
3732         (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);
3733 
3734     __kmp_get_user_lock_location_ =
3735         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);
3736 
3737     __kmp_set_user_lock_location_ = (void (*)(
3738         kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);
3739 
3740     __kmp_get_user_lock_flags_ =
3741         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);
3742 
3743     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3744         &__kmp_set_queuing_lock_flags);
3745 
3746   } break;
3747 #endif // KMP_USE_ADAPTIVE_LOCKS
3748 
3749   case lk_drdpa: {
3750     __kmp_base_user_lock_size = sizeof(kmp_base_drdpa_lock_t);
3751     __kmp_user_lock_size = sizeof(kmp_drdpa_lock_t);
3752 
3753     __kmp_get_user_lock_owner_ =
3754         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_owner);
3755 
3756     if (__kmp_env_consistency_check) {
3757       KMP_BIND_USER_LOCK_WITH_CHECKS(drdpa);
3758       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(drdpa);
3759     } else {
3760       KMP_BIND_USER_LOCK(drdpa);
3761       KMP_BIND_NESTED_USER_LOCK(drdpa);
3762     }
3763 
3764     __kmp_destroy_user_lock_ =
3765         (void (*)(kmp_user_lock_p))(&__kmp_destroy_drdpa_lock);
3766 
3767     __kmp_is_user_lock_initialized_ =
3768         (int (*)(kmp_user_lock_p))(&__kmp_is_drdpa_lock_initialized);
3769 
3770     __kmp_get_user_lock_location_ =
3771         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_location);
3772 
3773     __kmp_set_user_lock_location_ = (void (*)(
3774         kmp_user_lock_p, const ident_t *))(&__kmp_set_drdpa_lock_location);
3775 
3776     __kmp_get_user_lock_flags_ =
3777         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_flags);
3778 
3779     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3780         &__kmp_set_drdpa_lock_flags);
3781   } break;
3782   }
3783 }
3784 
3785 // ----------------------------------------------------------------------------
3786 // User lock table & lock allocation
3787 
3788 kmp_lock_table_t __kmp_user_lock_table = {1, 0, NULL};
3789 kmp_user_lock_p __kmp_lock_pool = NULL;
3790 
3791 // Lock block-allocation support.
3792 kmp_block_of_locks *__kmp_lock_blocks = NULL;
3793 int __kmp_num_locks_in_block = 1; // FIXME - tune this value
3794 
3795 static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) {
3796   // Assume that kmp_global_lock is held upon entry/exit.
3797   kmp_lock_index_t index;
3798   if (__kmp_user_lock_table.used >= __kmp_user_lock_table.allocated) {
3799     kmp_lock_index_t size;
3800     kmp_user_lock_p *table;
3801     // Reallocate lock table.
3802     if (__kmp_user_lock_table.allocated == 0) {
3803       size = 1024;
3804     } else {
3805       size = __kmp_user_lock_table.allocated * 2;
3806     }
3807     table = (kmp_user_lock_p *)__kmp_allocate(sizeof(kmp_user_lock_p) * size);
3808     KMP_MEMCPY(table + 1, __kmp_user_lock_table.table + 1,
3809                sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1));
3810     table[0] = (kmp_user_lock_p)__kmp_user_lock_table.table;
3811     // We cannot free the previous table now, since it may still be in use by
3812     // other threads, so we save the pointer to the previous table in the first
3813     // element of the new table. All the tables are thus organized into a list
3814     // and can be freed when the library shuts down.
3815     __kmp_user_lock_table.table = table;
3816     __kmp_user_lock_table.allocated = size;
3817   }
3818   KMP_DEBUG_ASSERT(__kmp_user_lock_table.used <
3819                    __kmp_user_lock_table.allocated);
3820   index = __kmp_user_lock_table.used;
3821   __kmp_user_lock_table.table[index] = lck;
3822   ++__kmp_user_lock_table.used;
3823   return index;
3824 }
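
// Illustrative sketch (not compiled, not upstream code): because element [0]
// of every reallocated table points at the previous, smaller table, shutdown
// can free all the generations by walking that chain, exactly what
// __kmp_cleanup_user_locks() does further below.
#if 0
  kmp_user_lock_p *t = __kmp_user_lock_table.table;
  while (t != NULL) {
    kmp_user_lock_p *prev = (kmp_user_lock_p *)t[0]; // older table, or NULL
    __kmp_free(t);
    t = prev;
  }
#endif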
3825 
3826 static kmp_user_lock_p __kmp_lock_block_allocate() {
3827   // Assume that kmp_global_lock is held upon entry/exit.
3828   static int last_index = 0;
3829   if ((last_index >= __kmp_num_locks_in_block) || (__kmp_lock_blocks == NULL)) {
3830     // Restart the index.
3831     last_index = 0;
3832     // Need to allocate a new block.
3833     KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
3834     size_t space_for_locks = __kmp_user_lock_size * __kmp_num_locks_in_block;
3835     char *buffer =
3836         (char *)__kmp_allocate(space_for_locks + sizeof(kmp_block_of_locks));
3837     // Set up the new block.
3838     kmp_block_of_locks *new_block =
3839         (kmp_block_of_locks *)(&buffer[space_for_locks]);
3840     new_block->next_block = __kmp_lock_blocks;
3841     new_block->locks = (void *)buffer;
3842     // Publish the new block.
3843     KMP_MB();
3844     __kmp_lock_blocks = new_block;
3845   }
3846   kmp_user_lock_p ret = (kmp_user_lock_p)(&(
3847       ((char *)(__kmp_lock_blocks->locks))[last_index * __kmp_user_lock_size]));
3848   last_index++;
3849   return ret;
3850 }
3851 
3852 // Get memory for a lock. It may be freshly allocated memory or memory reused
3853 // from the lock pool.
3854 kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock, kmp_int32 gtid,
3855                                          kmp_lock_flags_t flags) {
3856   kmp_user_lock_p lck;
3857   kmp_lock_index_t index;
3858   KMP_DEBUG_ASSERT(user_lock);
3859 
3860   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3861 
3862   if (__kmp_lock_pool == NULL) {
3863     // Lock pool is empty. Allocate new memory.
3864 
3865     if (__kmp_num_locks_in_block <= 1) { // Tune this cutoff point.
3866       lck = (kmp_user_lock_p)__kmp_allocate(__kmp_user_lock_size);
3867     } else {
3868       lck = __kmp_lock_block_allocate();
3869     }
3870 
3871     // Insert the lock into the table so that it can be freed in __kmp_cleanup
3872     // and so that the debugger has info on all allocated locks.
3873     index = __kmp_lock_table_insert(lck);
3874   } else {
3875     // Pick up lock from pool.
3876     lck = __kmp_lock_pool;
3877     index = __kmp_lock_pool->pool.index;
3878     __kmp_lock_pool = __kmp_lock_pool->pool.next;
3879   }
3880 
3881   // We could potentially differentiate between nested and regular locks
3882   // here, and do the lock table lookup for regular locks only.
3883   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3884     *((kmp_lock_index_t *)user_lock) = index;
3885   } else {
3886     *((kmp_user_lock_p *)user_lock) = lck;
3887   }
3888 
3889   // Mark the lock if it is a critical section lock.
3890   __kmp_set_user_lock_flags(lck, flags);
3891 
3892   __kmp_release_lock(&__kmp_global_lock, gtid); // AC: TODO: move this line up
3893 
3894   return lck;
3895 }
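
// Illustrative sketch (not compiled, not upstream code): allocation and
// release are symmetric. The word handed back to the user is either a 1-based
// index into __kmp_user_lock_table (when omp_lock_t is too small to hold a
// pointer) or the lock pointer itself, and __kmp_user_lock_free() below
// threads the lock back onto __kmp_lock_pool for reuse.
#if 0
  void *user_lock; // stands in for the user's omp_lock_t storage
  kmp_int32 gtid = __kmp_entry_gtid();
  kmp_user_lock_p lck = __kmp_user_lock_allocate(&user_lock, gtid, 0);
  // ... lock used here through the bound vptrs ...
  __kmp_user_lock_free(&user_lock, gtid, lck);
#endif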
3896 
3897 // Return the lock's memory to the pool for reuse.
3898 void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid,
3899                           kmp_user_lock_p lck) {
3900   KMP_DEBUG_ASSERT(user_lock != NULL);
3901   KMP_DEBUG_ASSERT(lck != NULL);
3902 
3903   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3904 
3905   lck->pool.next = __kmp_lock_pool;
3906   __kmp_lock_pool = lck;
3907   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3908     kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
3909     KMP_DEBUG_ASSERT(0 < index && index <= __kmp_user_lock_table.used);
3910     lck->pool.index = index;
3911   }
3912 
3913   __kmp_release_lock(&__kmp_global_lock, gtid);
3914 }
3915 
3916 kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock, char const *func) {
3917   kmp_user_lock_p lck = NULL;
3918 
3919   if (__kmp_env_consistency_check) {
3920     if (user_lock == NULL) {
3921       KMP_FATAL(LockIsUninitialized, func);
3922     }
3923   }
3924 
3925   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3926     kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
3927     if (__kmp_env_consistency_check) {
3928       if (!(0 < index && index < __kmp_user_lock_table.used)) {
3929         KMP_FATAL(LockIsUninitialized, func);
3930       }
3931     }
3932     KMP_DEBUG_ASSERT(0 < index && index < __kmp_user_lock_table.used);
3933     KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
3934     lck = __kmp_user_lock_table.table[index];
3935   } else {
3936     lck = *((kmp_user_lock_p *)user_lock);
3937   }
3938 
3939   if (__kmp_env_consistency_check) {
3940     if (lck == NULL) {
3941       KMP_FATAL(LockIsUninitialized, func);
3942     }
3943   }
3944 
3945   return lck;
3946 }
3947 
3948 void __kmp_cleanup_user_locks(void) {
3949   // Reset the lock pool. Don't worry about locks in the pool; we will free them
3950   // while iterating through the lock table (all the locks, dead or alive).
3951   __kmp_lock_pool = NULL;
3952 
3953 #define IS_CRITICAL(lck)                                                       \
3954   ((__kmp_get_user_lock_flags_ != NULL) &&                                     \
3955    ((*__kmp_get_user_lock_flags_)(lck)&kmp_lf_critical_section))
3956 
3957   // Loop through lock table, free all locks.
3958   // Do not free item [0], it is reserved for lock tables list.
3959   //
3960   // FIXME - we are iterating through a list of (pointers to) objects of type
3961   // union kmp_user_lock, but we have no way of knowing whether the base type is
3962   // currently "pool" or whatever the global user lock type is.
3963   //
3964   // We are relying on the fact that for all of the user lock types
3965   // (except "tas"), the first field in the lock struct is the "initialized"
3966   // field, which is set to the address of the lock object itself when
3967   // the lock is initialized.  When the union is of type "pool", the
3968   // first field is a pointer to the next object in the free list, which
3969   // will not be the same address as the object itself.
3970   //
3971   // This means that the check (*__kmp_is_user_lock_initialized_)(lck) will fail
3972   // for "pool" objects on the free list.  This must happen as the "location"
3973   // field of real user locks overlaps the "index" field of "pool" objects.
3974   //
3975   // It would be better to run through the free list, and remove all "pool"
3976   // objects from the lock table before executing this loop.  However,
3977   // "pool" objects do not always have their index field set (only on
3978   // lin_32e), and I don't want to search the lock table for the address
3979   // of every "pool" object on the free list.
3980   while (__kmp_user_lock_table.used > 1) {
3981     const ident *loc;
3982 
3983     // Reduce __kmp_user_lock_table.used before freeing the lock,
3984     // so that the state of the locks stays consistent.
3985     kmp_user_lock_p lck =
3986         __kmp_user_lock_table.table[--__kmp_user_lock_table.used];
3987 
3988     if ((__kmp_is_user_lock_initialized_ != NULL) &&
3989         (*__kmp_is_user_lock_initialized_)(lck)) {
3990       // Issue a warning if: KMP_CONSISTENCY_CHECK AND lock is initialized AND
3991       // it is NOT a critical section (user is not responsible for destroying
3992       // criticals) AND we know source location to report.
3993       if (__kmp_env_consistency_check && (!IS_CRITICAL(lck)) &&
3994           ((loc = __kmp_get_user_lock_location(lck)) != NULL) &&
3995           (loc->psource != NULL)) {
3996         kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
3997         KMP_WARNING(CnsLockNotDestroyed, str_loc.file, str_loc.line);
3998         __kmp_str_loc_free(&str_loc);
3999       }
4000 
4001 #ifdef KMP_DEBUG
4002       if (IS_CRITICAL(lck)) {
4003         KA_TRACE(
4004             20,
4005             ("__kmp_cleanup_user_locks: free critical section lock %p (%p)\n",
4006              lck, *(void **)lck));
4007       } else {
4008         KA_TRACE(20, ("__kmp_cleanup_user_locks: free lock %p (%p)\n", lck,
4009                       *(void **)lck));
4010       }
4011 #endif // KMP_DEBUG
4012 
4013       // Cleanup internal lock dynamic resources (for drdpa locks particularly).
4014       __kmp_destroy_user_lock(lck);
4015     }
4016 
4017     // Free the lock if block allocation of locks is not used.
4018     if (__kmp_lock_blocks == NULL) {
4019       __kmp_free(lck);
4020     }
4021   }
4022 
4023 #undef IS_CRITICAL
4024 
4025   // delete lock table(s).
4026   kmp_user_lock_p *table_ptr = __kmp_user_lock_table.table;
4027   __kmp_user_lock_table.table = NULL;
4028   __kmp_user_lock_table.allocated = 0;
4029 
4030   while (table_ptr != NULL) {
4031     // In the first element we saved the pointer to the previous
4032     // (smaller) lock table.
4033     kmp_user_lock_p *next = (kmp_user_lock_p *)(table_ptr[0]);
4034     __kmp_free(table_ptr);
4035     table_ptr = next;
4036   }
4037 
4038   // Free buffers allocated for blocks of locks.
4039   kmp_block_of_locks_t *block_ptr = __kmp_lock_blocks;
4040   __kmp_lock_blocks = NULL;
4041 
4042   while (block_ptr != NULL) {
4043     kmp_block_of_locks_t *next = block_ptr->next_block;
4044     __kmp_free(block_ptr->locks);
4045     // *block_ptr itself was allocated at the end of the locks vector.
4046     block_ptr = next;
4047   }
4048 
4049   TCW_4(__kmp_init_user_locks, FALSE);
4050 }
4051 
4052 #endif // KMP_USE_DYNAMIC_LOCK
4053