1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2015, Joyent, Inc.
26 * Copyright (c) 2016 by Delphix. All rights reserved.
27 * Copyright 2024 Oxide Computer Company
28 */
29
30 #include "lint.h"
31 #include "thr_uberdata.h"
32 #include <sys/rtpriocntl.h>
33 #include <sys/sdt.h>
34 #include <atomic.h>
35
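/*
 * Statistics macros, active only in DEBUG builds:
 * INCR32() increments a 32-bit counter, sticking at UINT32_MAX;
 * INCR()/DECR() adjust a counter; MAXINCR() increments a counter
 * and records its high-water mark. In non-DEBUG builds they
 * compile to nothing.
 */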
36 #if defined(DEBUG)
37 #define INCR32(x) (((x) != UINT32_MAX)? (x)++ : 0)
38 #define INCR(x) ((x)++)
39 #define DECR(x) ((x)--)
40 #define MAXINCR(m, x) ((m < ++x)? (m = x) : 0)
41 #else
42 #define INCR32(x)
43 #define INCR(x)
44 #define DECR(x)
45 #define MAXINCR(m, x)
46 #endif
47
48 /*
49 * This mutex is initialized to be held by lwp#1.
50 * It is used to block a thread that has returned from a mutex_lock()
51 * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
52 */
53 mutex_t stall_mutex = DEFAULTMUTEX;
54
55 static int shared_mutex_held(mutex_t *);
56 static int mutex_queuelock_adaptive(mutex_t *);
57 static void mutex_wakeup_all(mutex_t *);
58
59 /*
60 * Lock statistics support functions.
61 */
62 void
63 record_begin_hold(tdb_mutex_stats_t *msp)
64 {
65 tdb_incr(msp->mutex_lock);
66 msp->mutex_begin_hold = gethrtime();
67 }
68
69 hrtime_t
70 record_hold_time(tdb_mutex_stats_t *msp)
71 {
72 hrtime_t now = gethrtime();
73
74 if (msp->mutex_begin_hold)
75 msp->mutex_hold_time += now - msp->mutex_begin_hold;
76 msp->mutex_begin_hold = 0;
77 return (now);
78 }
79
80 /*
81 * Called once at library initialization.
82 */
83 void
84 mutex_setup(void)
85 {
86 if (set_lock_byte(&stall_mutex.mutex_lockw))
87 thr_panic("mutex_setup() cannot acquire stall_mutex");
88 stall_mutex.mutex_owner = (uintptr_t)curthread;
89 }
90
91 /*
92 * The default spin count of 1000 is experimentally determined.
93 * On sun4u machines with any number of processors it could be raised
94 * to 10,000 but that (experimentally) makes almost no difference.
95 * The environment variable:
96 * _THREAD_ADAPTIVE_SPIN=count
97 * can be used to override and set the count in the range [0 .. 1,000,000].
98 */
99 int thread_adaptive_spin = 1000;
100 uint_t thread_max_spinners = 100;
101 int thread_queue_verify = 0;
102 static int ncpus;
103
104 /*
105 * Distinguish spinning for queue locks from spinning for regular locks.
106 * We try harder to acquire queue locks by spinning.
107 * The environment variable:
108 * _THREAD_QUEUE_SPIN=count
109 * can be used to override and set the count in the range [0 .. 1,000,000].
110 */
111 int thread_queue_spin = 10000;
112
113 #define ALL_ATTRIBUTES \
114 (LOCK_RECURSIVE | LOCK_ERRORCHECK | \
115 LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT | \
116 LOCK_ROBUST)
117
118 /*
119 * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
120 * augmented by zero or more of the flags:
121 * LOCK_RECURSIVE
122 * LOCK_ERRORCHECK
123 * LOCK_PRIO_INHERIT
124 * LOCK_PRIO_PROTECT
125 * LOCK_ROBUST
126 */
127 #pragma weak _mutex_init = mutex_init
128 /* ARGSUSED2 */
129 int
130 mutex_init(mutex_t *mp, int type, void *arg)
131 {
132 int basetype = (type & ~ALL_ATTRIBUTES);
133 const pcclass_t *pccp;
134 int error = 0;
135 int ceil;
136
137 if (basetype == USYNC_PROCESS_ROBUST) {
138 /*
139 * USYNC_PROCESS_ROBUST is a deprecated historical type.
140 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
141 * retain the USYNC_PROCESS_ROBUST flag so we can return
142 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
143 * mutexes will ever draw ELOCKUNMAPPED).
144 */
145 type |= (USYNC_PROCESS | LOCK_ROBUST);
146 basetype = USYNC_PROCESS;
147 }
148
149 if (type & LOCK_PRIO_PROTECT)
150 pccp = get_info_by_policy(SCHED_FIFO);
151 if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
152 (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
153 == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
154 ((type & LOCK_PRIO_PROTECT) &&
155 ((ceil = *(int *)arg) < pccp->pcc_primin ||
156 ceil > pccp->pcc_primax))) {
157 error = EINVAL;
158 } else if (type & LOCK_ROBUST) {
159 /*
160 * Callers of mutex_init() with the LOCK_ROBUST attribute
161 * are required to pass an initially all-zero mutex.
162 * Multiple calls to mutex_init() are allowed; all but
163 * the first return EBUSY. A call to mutex_init() is
164 * allowed to make an inconsistent robust lock consistent
165 * (for historical usage, even though the proper interface
166 * for this is mutex_consistent()). Note that we use
167 * atomic_or_16() to set the LOCK_INITED flag so as
168 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
169 */
170 if (!(mp->mutex_flag & LOCK_INITED)) {
171 mp->mutex_type = (uint8_t)type;
172 atomic_or_16(&mp->mutex_flag, LOCK_INITED);
173 mp->mutex_magic = MUTEX_MAGIC;
174 } else if (type != mp->mutex_type ||
175 ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
176 error = EINVAL;
177 } else if (mutex_consistent(mp) != 0) {
178 error = EBUSY;
179 }
180 /* register a process robust mutex with the kernel */
181 if (basetype == USYNC_PROCESS)
182 register_lock(mp);
183 } else {
184 (void) memset(mp, 0, sizeof (*mp));
185 mp->mutex_type = (uint8_t)type;
186 mp->mutex_flag = LOCK_INITED;
187 mp->mutex_magic = MUTEX_MAGIC;
188 }
189
190 if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
191 mp->mutex_ceiling = ceil;
192 }
193
194 /*
195 * This should be at the beginning of the function,
196 * but for the sake of old broken applications that
197 * do not have proper alignment for their mutexes
198 * (and don't check the return code from mutex_init),
199 * we put it here, after initializing the mutex regardless.
200 */
201 if (error == 0 &&
202 ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
203 curthread->ul_misaligned == 0)
204 error = EINVAL;
205
206 return (error);
207 }
208
209 /*
210 * Delete mp from list of ceiling mutexes owned by curthread.
211 * Return 1 if the head of the chain was updated.
212 */
213 int
214 _ceil_mylist_del(mutex_t *mp)
215 {
216 ulwp_t *self = curthread;
217 mxchain_t **mcpp;
218 mxchain_t *mcp;
219
220 for (mcpp = &self->ul_mxchain;
221 (mcp = *mcpp) != NULL;
222 mcpp = &mcp->mxchain_next) {
223 if (mcp->mxchain_mx == mp) {
224 *mcpp = mcp->mxchain_next;
225 lfree(mcp, sizeof (*mcp));
226 return (mcpp == &self->ul_mxchain);
227 }
228 }
229 return (0);
230 }
231
232 /*
233 * Add mp to the list of ceiling mutexes owned by curthread.
234 * Return ENOMEM if no memory could be allocated.
235 */
236 int
237 _ceil_mylist_add(mutex_t *mp)
238 {
239 ulwp_t *self = curthread;
240 mxchain_t *mcp;
241
242 if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
243 return (ENOMEM);
244 mcp->mxchain_mx = mp;
245 mcp->mxchain_next = self->ul_mxchain;
246 self->ul_mxchain = mcp;
247 return (0);
248 }
249
250 /*
251 * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
252 */
253 static void
254 set_rt_priority(ulwp_t *self, int prio)
255 {
256 pcparms_t pcparm;
257
258 pcparm.pc_cid = self->ul_rtclassid;
259 ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
260 ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
261 (void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
262 }
263
264 /*
265 * Inherit priority from ceiling.
266 * This changes the effective priority, not the assigned priority.
267 */
268 void
269 _ceil_prio_inherit(int prio)
270 {
271 ulwp_t *self = curthread;
272
273 self->ul_epri = prio;
274 set_rt_priority(self, prio);
275 }
276
277 /*
278 * Waive inherited ceiling priority. Inherit from head of owned ceiling locks
279 * if holding at least one ceiling lock. If no ceiling locks are held at this
280 * point, disinherit completely, reverting back to assigned priority.
281 */
282 void
283 _ceil_prio_waive(void)
284 {
285 ulwp_t *self = curthread;
286 mxchain_t *mcp = self->ul_mxchain;
287 int prio;
288
289 if (mcp == NULL) {
290 prio = self->ul_pri;
291 self->ul_epri = 0;
292 } else {
293 prio = mcp->mxchain_mx->mutex_ceiling;
294 self->ul_epri = prio;
295 }
296 set_rt_priority(self, prio);
297 }
298
299 /*
300 * Clear the lock byte. Retain the waiters byte and the spinners byte.
301 * Return the old value of the lock word.
302 */
303 static uint32_t
304 clear_lockbyte(volatile uint32_t *lockword)
305 {
306 uint32_t old;
307 uint32_t new;
308
309 do {
310 old = *lockword;
311 new = old & ~LOCKMASK;
312 } while (atomic_cas_32(lockword, old, new) != old);
313
314 return (old);
315 }
316
317 /*
318 * Same as clear_lockbyte(), but operates on mutex_lockword64.
319 * The mutex_ownerpid field is cleared along with the lock byte.
320 */
321 static uint64_t
322 clear_lockbyte64(volatile uint64_t *lockword64)
323 {
324 uint64_t old;
325 uint64_t new;
326
327 do {
328 old = *lockword64;
329 new = old & ~LOCKMASK64;
330 } while (atomic_cas_64(lockword64, old, new) != old);
331
332 return (old);
333 }
334
335 /*
336 * Similar to set_lock_byte(), which only tries to set the lock byte.
337 * Here, we attempt to set the lock byte AND the mutex_ownerpid, keeping
338 * the remaining bytes constant. This atomic operation is required for the
339 * correctness of process-shared robust locks, otherwise there would be
340 * a window of vulnerability in which the lock byte had been set but the
341 * mutex_ownerpid had not yet been set. If the process were to die in
342 * this window of vulnerability (due to some other thread calling exit()
343 * or the process receiving a fatal signal), the mutex would be left locked
344 * but without a process-ID to determine which process was holding the lock.
345 * The kernel would then be unable to mark the robust mutex as LOCK_OWNERDEAD
346 * when the process died. For all other cases of process-shared locks, this
347 * operation is just a convenience, for the sake of common code.
348 *
349 * This operation requires process-shared robust locks to be properly
350 * aligned on an 8-byte boundary, at least on sparc machines, lest the
351 * operation incur an alignment fault. This is automatic when locks
352 * are declared properly using the mutex_t or pthread_mutex_t data types
353 * and the application does not allocate dynamic memory on less than an
354 * 8-byte boundary. See the 'horrible hack' comments below for cases
355 * dealing with such broken applications.
356 */
357 static int
358 set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
359 {
360 uint64_t old;
361 uint64_t new;
362
363 old = *lockword64 & ~LOCKMASK64;
364 new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
365 if (atomic_cas_64(lockword64, old, new) == old)
366 return (LOCKCLEAR);
367
368 return (LOCKSET);
369 }
370
371 /*
372 * Increment the spinners count in the mutex lock word.
373 * Return 0 on success. Return -1 if the count would overflow.
374 */
375 static int
376 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
377 {
378 uint32_t old;
379 uint32_t new;
380
381 do {
382 old = *lockword;
383 if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
384 return (-1);
385 new = old + (1 << SPINNERSHIFT);
386 } while (atomic_cas_32(lockword, old, new) != old);
387
388 return (0);
389 }
390
391 /*
392 * Decrement the spinners count in the mutex lock word.
393 * Return the new value of the lock word.
394 */
395 static uint32_t
396 spinners_decr(volatile uint32_t *lockword)
397 {
398 uint32_t old;
399 uint32_t new;
400
401 do {
402 new = old = *lockword;
403 if (new & SPINNERMASK)
404 new -= (1 << SPINNERSHIFT);
405 } while (atomic_cas_32(lockword, old, new) != old);
406
407 return (new);
408 }
409
410 /*
411 * Non-preemptive spin locks. Used by queue_lock().
412 * No lock statistics are gathered for these locks.
413 * No DTrace probes are provided for these locks.
414 */
415 void
416 spin_lock_set(mutex_t *mp)
417 {
418 ulwp_t *self = curthread;
419
420 no_preempt(self);
421 if (set_lock_byte(&mp->mutex_lockw) == 0) {
422 mp->mutex_owner = (uintptr_t)self;
423 return;
424 }
425 /*
426 * Spin for a while, attempting to acquire the lock.
427 */
428 INCR32(self->ul_spin_lock_spin);
429 if (mutex_queuelock_adaptive(mp) == 0 ||
430 set_lock_byte(&mp->mutex_lockw) == 0) {
431 mp->mutex_owner = (uintptr_t)self;
432 return;
433 }
434 /*
435 * Try harder if we were previously at a no-preemption level.
436 */
437 if (self->ul_preempt > 1) {
438 INCR32(self->ul_spin_lock_spin2);
439 if (mutex_queuelock_adaptive(mp) == 0 ||
440 set_lock_byte(&mp->mutex_lockw) == 0) {
441 mp->mutex_owner = (uintptr_t)self;
442 return;
443 }
444 }
445 /*
446 * Give up and block in the kernel for the mutex.
447 */
448 INCR32(self->ul_spin_lock_sleep);
449 (void) ___lwp_mutex_timedlock(mp, NULL, self);
450 }
451
452 void
453 spin_lock_clear(mutex_t *mp)
454 {
455 ulwp_t *self = curthread;
456
457 mp->mutex_owner = 0;
458 if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
459 (void) ___lwp_mutex_wakeup(mp, 0);
460 INCR32(self->ul_spin_lock_wakeup);
461 }
462 preempt(self);
463 }
464
465 /*
466 * Allocate the sleep queue hash table.
467 */
468 void
469 queue_alloc(void)
470 {
471 ulwp_t *self = curthread;
472 uberdata_t *udp = self->ul_uberdata;
473 queue_head_t *qp;
474 void *data;
475 int i;
476
477 /*
478 * No locks are needed; we call here only when single-threaded.
479 */
480 ASSERT(self == udp->ulwp_one);
481 ASSERT(!udp->uberflags.uf_mt);
482 if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
483 PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
484 == MAP_FAILED)
485 thr_panic("cannot allocate thread queue_head table");
486 udp->queue_head = qp = (queue_head_t *)data;
487 for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
488 qp->qh_type = (i < QHASHSIZE)? MX : CV;
489 qp->qh_lock.mutex_flag = LOCK_INITED;
490 qp->qh_lock.mutex_magic = MUTEX_MAGIC;
491 qp->qh_hlist = &qp->qh_def_root;
492 #if defined(DEBUG)
493 qp->qh_hlen = 1;
494 qp->qh_hmax = 1;
495 #endif
496 }
497 }
498
499 #if defined(DEBUG)
500
501 /*
502 * Debugging: verify correctness of a sleep queue.
503 */
504 void
505 QVERIFY(queue_head_t *qp)
506 {
507 ulwp_t *self = curthread;
508 uberdata_t *udp = self->ul_uberdata;
509 queue_root_t *qrp;
510 ulwp_t *ulwp;
511 ulwp_t *prev;
512 uint_t index;
513 uint32_t cnt;
514 char qtype;
515 void *wchan;
516
517 ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
518 ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
519 for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
520 cnt++;
521 ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
522 (qrp->qr_head == NULL && qrp->qr_tail == NULL));
523 }
524 ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);
525 qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
526 ASSERT(qp->qh_type == qtype);
527 if (!thread_queue_verify)
528 return;
529 /* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
530 for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
531 for (prev = NULL, ulwp = qrp->qr_head; ulwp != NULL;
532 prev = ulwp, ulwp = ulwp->ul_link) {
533 cnt++;
534 if (ulwp->ul_writer)
535 ASSERT(prev == NULL || prev->ul_writer);
536 ASSERT(ulwp->ul_qtype == qtype);
537 ASSERT(ulwp->ul_wchan != NULL);
538 ASSERT(ulwp->ul_sleepq == qp);
539 wchan = ulwp->ul_wchan;
540 ASSERT(qrp->qr_wchan == wchan);
541 index = QUEUE_HASH(wchan, qtype);
542 ASSERT(&udp->queue_head[index] == qp);
543 }
544 ASSERT(qrp->qr_tail == prev);
545 }
546 ASSERT(qp->qh_qlen == cnt);
547 }
548
549 #else /* DEBUG */
550
551 #define QVERIFY(qp)
552
553 #endif /* DEBUG */
554
555 /*
556 * Acquire a queue head.
557 */
558 queue_head_t *
559 queue_lock(void *wchan, int qtype)
560 {
561 uberdata_t *udp = curthread->ul_uberdata;
562 queue_head_t *qp;
563 queue_root_t *qrp;
564
565 ASSERT(qtype == MX || qtype == CV);
566
567 /*
568 * It is possible that we could be called while still single-threaded.
569 * If so, we call queue_alloc() to allocate the queue_head[] array.
570 */
571 if ((qp = udp->queue_head) == NULL) {
572 queue_alloc();
573 qp = udp->queue_head;
574 }
575 qp += QUEUE_HASH(wchan, qtype);
576 spin_lock_set(&qp->qh_lock);
577 for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
578 if (qrp->qr_wchan == wchan)
579 break;
580 if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
581 /* the default queue root is available; use it */
582 qrp = &qp->qh_def_root;
583 qrp->qr_wchan = wchan;
584 ASSERT(qrp->qr_next == NULL);
585 ASSERT(qrp->qr_tail == NULL &&
586 qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
587 }
588 qp->qh_wchan = wchan; /* valid until queue_unlock() is called */
589 qp->qh_root = qrp; /* valid until queue_unlock() is called */
590 INCR32(qp->qh_lockcount);
591 QVERIFY(qp);
592 return (qp);
593 }
594
595 /*
596 * Release a queue head.
597 */
598 void
599 queue_unlock(queue_head_t *qp)
600 {
601 QVERIFY(qp);
602 spin_lock_clear(&qp->qh_lock);
603 }
604
605 /*
606 * For rwlock queueing, we must queue writers ahead of readers of the
607 * same priority. We do this by making writers appear to have a half
608 * point higher priority for purposes of priority comparisons below.
609 */
610 #define CMP_PRIO(ulwp) ((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
611
612 void
613 enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
614 {
615 queue_root_t *qrp;
616 ulwp_t **ulwpp;
617 ulwp_t *next;
618 int pri = CMP_PRIO(ulwp);
619
620 ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
621 ASSERT(ulwp->ul_sleepq != qp);
622
623 if ((qrp = qp->qh_root) == NULL) {
624 /* use the thread's queue root for the linkage */
625 qrp = &ulwp->ul_queue_root;
626 qrp->qr_next = qp->qh_hlist;
627 qrp->qr_prev = NULL;
628 qrp->qr_head = NULL;
629 qrp->qr_tail = NULL;
630 qrp->qr_wchan = qp->qh_wchan;
631 qrp->qr_rtcount = 0;
632 qrp->qr_qlen = 0;
633 qrp->qr_qmax = 0;
634 qp->qh_hlist->qr_prev = qrp;
635 qp->qh_hlist = qrp;
636 qp->qh_root = qrp;
637 MAXINCR(qp->qh_hmax, qp->qh_hlen);
638 }
639
640 /*
641 * LIFO queue ordering is unfair and can lead to starvation,
642 * but it gives better performance for heavily contended locks.
643 * We use thread_queue_fifo (range is 0..8) to determine
644 * the frequency of FIFO vs LIFO queuing:
645 * 0 : every 256th time (almost always LIFO)
646 * 1 : every 128th time
647 * 2 : every 64th time
648 * 3 : every 32nd time
649 * 4 : every 16th time (the default value, mostly LIFO)
650 * 5 : every 8th time
651 * 6 : every 4th time
652 * 7 : every 2nd time
653 * 8 : every time (never LIFO, always FIFO)
654 * Note that there is always some degree of FIFO ordering.
655 * This breaks live lock conditions that occur in applications
656 * that are written assuming (incorrectly) that threads acquire
657 * locks fairly, that is, in roughly round-robin order.
658 * In any event, the queue is maintained in kernel priority order.
659 *
660 * If force_fifo is non-zero, fifo queueing is forced.
661 * SUSV3 requires this for semaphores.
662 */
663 if (qrp->qr_head == NULL) {
664 /*
665 * The queue is empty. LIFO/FIFO doesn't matter.
666 */
667 ASSERT(qrp->qr_tail == NULL);
668 ulwpp = &qrp->qr_head;
669 } else if (force_fifo |
670 (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
671 /*
672 * Enqueue after the last thread whose priority is greater
673 * than or equal to the priority of the thread being queued.
674 * Attempt first to go directly onto the tail of the queue.
675 */
676 if (pri <= CMP_PRIO(qrp->qr_tail))
677 ulwpp = &qrp->qr_tail->ul_link;
678 else {
679 for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
680 ulwpp = &next->ul_link)
681 if (pri > CMP_PRIO(next))
682 break;
683 }
684 } else {
685 /*
686 * Enqueue before the first thread whose priority is less
687 * than or equal to the priority of the thread being queued.
688 * Hopefully we can go directly onto the head of the queue.
689 */
690 for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
691 ulwpp = &next->ul_link)
692 if (pri >= CMP_PRIO(next))
693 break;
694 }
695 if ((ulwp->ul_link = *ulwpp) == NULL)
696 qrp->qr_tail = ulwp;
697 *ulwpp = ulwp;
698
699 ulwp->ul_sleepq = qp;
700 ulwp->ul_wchan = qp->qh_wchan;
701 ulwp->ul_qtype = qp->qh_type;
702 if ((ulwp->ul_schedctl != NULL &&
703 ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
704 ulwp->ul_pilocks) {
705 ulwp->ul_rtqueued = 1;
706 qrp->qr_rtcount++;
707 }
708 MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
709 MAXINCR(qp->qh_qmax, qp->qh_qlen);
710 }
711
712 /*
713 * Helper function for queue_slot() and queue_slot_rt().
714 * Try to find a non-suspended thread on the queue.
715 */
716 static ulwp_t **
717 queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
718 {
719 ulwp_t *ulwp;
720 ulwp_t **foundpp = NULL;
721 int priority = -1;
722 ulwp_t *prev;
723 int tpri;
724
725 for (prev = NULL;
726 (ulwp = *ulwpp) != NULL;
727 prev = ulwp, ulwpp = &ulwp->ul_link) {
728 if (ulwp->ul_stop) /* skip suspended threads */
729 continue;
730 tpri = rt? CMP_PRIO(ulwp) : 0;
731 if (tpri > priority) {
732 foundpp = ulwpp;
733 *prevp = prev;
734 priority = tpri;
735 if (!rt)
736 break;
737 }
738 }
739 return (foundpp);
740 }
741
742 /*
743 * For real-time, we search the entire queue because the dispatch
744 * (kernel) priorities may have changed since enqueueing.
745 */
746 static ulwp_t **
747 queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
748 {
749 ulwp_t **ulwpp = ulwpp_org;
750 ulwp_t *ulwp = *ulwpp;
751 ulwp_t **foundpp = ulwpp;
752 int priority = CMP_PRIO(ulwp);
753 ulwp_t *prev;
754 int tpri;
755
756 for (prev = ulwp, ulwpp = &ulwp->ul_link;
757 (ulwp = *ulwpp) != NULL;
758 prev = ulwp, ulwpp = &ulwp->ul_link) {
759 tpri = CMP_PRIO(ulwp);
760 if (tpri > priority) {
761 foundpp = ulwpp;
762 *prevp = prev;
763 priority = tpri;
764 }
765 }
766 ulwp = *foundpp;
767
768 /*
769 * Try not to return a suspended thread.
770 * This mimics the old libthread's behavior.
771 */
772 if (ulwp->ul_stop &&
773 (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
774 foundpp = ulwpp;
775 ulwp = *foundpp;
776 }
777 ulwp->ul_rt = 1;
778 return (foundpp);
779 }
780
781 ulwp_t **
782 queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
783 {
784 queue_root_t *qrp;
785 ulwp_t **ulwpp;
786 ulwp_t *ulwp;
787 int rt;
788
789 ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
790
791 if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
792 *more = 0;
793 return (NULL); /* no lwps on the queue */
794 }
795 rt = (qrp->qr_rtcount != 0);
796 *prevp = NULL;
797 if (ulwp->ul_link == NULL) { /* only one lwp on the queue */
798 *more = 0;
799 ulwp->ul_rt = rt;
800 return (&qrp->qr_head);
801 }
802 *more = 1;
803
804 if (rt) /* real-time queue */
805 return (queue_slot_rt(&qrp->qr_head, prevp));
806 /*
807 * Try not to return a suspended thread.
808 * This mimics the old libthread's behavior.
809 */
810 if (ulwp->ul_stop &&
811 (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
812 ulwp = *ulwpp;
813 ulwp->ul_rt = 0;
814 return (ulwpp);
815 }
816 /*
817 * The common case; just pick the first thread on the queue.
818 */
819 ulwp->ul_rt = 0;
820 return (&qrp->qr_head);
821 }
822
823 /*
824 * Common code for unlinking an lwp from a user-level sleep queue.
825 */
826 void
827 queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
828 {
829 queue_root_t *qrp = qp->qh_root;
830 queue_root_t *nqrp;
831 ulwp_t *ulwp = *ulwpp;
832 ulwp_t *next;
833
834 ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
835 ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);
836
837 DECR(qp->qh_qlen);
838 DECR(qrp->qr_qlen);
839 if (ulwp->ul_rtqueued) {
840 ulwp->ul_rtqueued = 0;
841 qrp->qr_rtcount--;
842 }
843 next = ulwp->ul_link;
844 *ulwpp = next;
845 ulwp->ul_link = NULL;
846 if (qrp->qr_tail == ulwp)
847 qrp->qr_tail = prev;
848 if (qrp == &ulwp->ul_queue_root) {
849 /*
850 * We can't continue to use the unlinked thread's
851 * queue root for the linkage.
852 */
853 queue_root_t *qr_next = qrp->qr_next;
854 queue_root_t *qr_prev = qrp->qr_prev;
855
856 if (qrp->qr_tail) {
857 /* switch to using the last thread's queue root */
858 ASSERT(qrp->qr_qlen != 0);
859 nqrp = &qrp->qr_tail->ul_queue_root;
860 *nqrp = *qrp;
861 if (qr_next)
862 qr_next->qr_prev = nqrp;
863 if (qr_prev)
864 qr_prev->qr_next = nqrp;
865 else
866 qp->qh_hlist = nqrp;
867 qp->qh_root = nqrp;
868 } else {
869 /* empty queue root; just delete from the hash list */
870 ASSERT(qrp->qr_qlen == 0);
871 if (qr_next)
872 qr_next->qr_prev = qr_prev;
873 if (qr_prev)
874 qr_prev->qr_next = qr_next;
875 else
876 qp->qh_hlist = qr_next;
877 qp->qh_root = NULL;
878 DECR(qp->qh_hlen);
879 }
880 }
881 }
882
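/*
 * Remove from the sleep queue the thread chosen by queue_slot()
 * and return it, or NULL if the queue is empty. '*more' indicates
 * whether other threads remain on the queue.
 */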
883 ulwp_t *
884 dequeue(queue_head_t *qp, int *more)
885 {
886 ulwp_t **ulwpp;
887 ulwp_t *ulwp;
888 ulwp_t *prev;
889
890 if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
891 return (NULL);
892 ulwp = *ulwpp;
893 queue_unlink(qp, ulwpp, prev);
894 ulwp->ul_sleepq = NULL;
895 ulwp->ul_wchan = NULL;
896 return (ulwp);
897 }
898
899 /*
900 * Return a pointer to the highest priority thread sleeping on wchan.
901 */
902 ulwp_t *
903 queue_waiter(queue_head_t *qp)
904 {
905 ulwp_t **ulwpp;
906 ulwp_t *prev;
907 int more;
908
909 if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
910 return (NULL);
911 return (*ulwpp);
912 }
913
914 int
915 dequeue_self(queue_head_t *qp)
916 {
917 ulwp_t *self = curthread;
918 queue_root_t *qrp;
919 ulwp_t **ulwpp;
920 ulwp_t *ulwp;
921 ulwp_t *prev;
922 int found = 0;
923
924 ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
925
926 /* find self on the sleep queue */
927 if ((qrp = qp->qh_root) != NULL) {
928 for (prev = NULL, ulwpp = &qrp->qr_head;
929 (ulwp = *ulwpp) != NULL;
930 prev = ulwp, ulwpp = &ulwp->ul_link) {
931 if (ulwp == self) {
932 queue_unlink(qp, ulwpp, prev);
933 self->ul_cvmutex = NULL;
934 self->ul_sleepq = NULL;
935 self->ul_wchan = NULL;
936 found = 1;
937 break;
938 }
939 }
940 }
941
942 if (!found)
943 thr_panic("dequeue_self(): curthread not found on queue");
944
945 return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
946 }
947
948 /*
949 * Called from call_user_handler() and _thrp_suspend() to take
950 * ourself off of our sleep queue so we can grab locks.
951 */
952 void
953 unsleep_self(void)
954 {
955 ulwp_t *self = curthread;
956 queue_head_t *qp;
957
958 /*
959 * Calling enter_critical()/exit_critical() here would lead
960 * to recursion. Just manipulate self->ul_critical directly.
961 */
962 self->ul_critical++;
963 while (self->ul_sleepq != NULL) {
964 qp = queue_lock(self->ul_wchan, self->ul_qtype);
965 /*
966 * We may have been moved from a CV queue to a
967 * mutex queue while we were attempting queue_lock().
968 * If so, just loop around and try again.
969 * dequeue_self() clears self->ul_sleepq.
970 */
971 if (qp == self->ul_sleepq)
972 (void) dequeue_self(qp);
973 queue_unlock(qp);
974 }
975 self->ul_writer = 0;
976 self->ul_critical--;
977 }
978
979 /*
980 * Common code for calling the ___lwp_mutex_timedlock() system call.
981 * Returns with mutex_owner and mutex_ownerpid set correctly.
982 */
983 static int
984 mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
985 {
986 ulwp_t *self = curthread;
987 uberdata_t *udp = self->ul_uberdata;
988 int mtype = mp->mutex_type;
989 hrtime_t begin_sleep;
990 int acquired;
991 int error;
992
993 self->ul_sp = stkptr();
994 self->ul_wchan = mp;
995 if (__td_event_report(self, TD_SLEEP, udp)) {
996 self->ul_td_evbuf.eventnum = TD_SLEEP;
997 self->ul_td_evbuf.eventdata = mp;
998 tdb_event(TD_SLEEP, udp);
999 }
1000 if (msp) {
1001 tdb_incr(msp->mutex_sleep);
1002 begin_sleep = gethrtime();
1003 }
1004
1005 DTRACE_PROBE1(plockstat, mutex__block, mp);
1006
1007 for (;;) {
1008 /*
1009 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1010 * means we successfully acquired the lock.
1011 */
1012 if ((error = ___lwp_mutex_timedlock(mp, tsp, self)) != 0 &&
1013 error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1014 acquired = 0;
1015 break;
1016 }
1017
1018 if (mtype & USYNC_PROCESS) {
1019 /*
1020 * Defend against forkall(). We may be the child,
1021 * in which case we don't actually own the mutex.
1022 */
1023 enter_critical(self);
1024 if (mp->mutex_ownerpid == udp->pid) {
1025 exit_critical(self);
1026 acquired = 1;
1027 break;
1028 }
1029 exit_critical(self);
1030 } else {
1031 acquired = 1;
1032 break;
1033 }
1034 }
1035
1036 if (msp)
1037 msp->mutex_sleep_time += gethrtime() - begin_sleep;
1038 self->ul_wchan = NULL;
1039 self->ul_sp = 0;
1040
1041 if (acquired) {
1042 ASSERT(mp->mutex_owner == (uintptr_t)self);
1043 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1044 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1045 } else {
1046 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1047 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1048 }
1049
1050 return (error);
1051 }
1052
1053 /*
1054 * Common code for calling the ___lwp_mutex_trylock() system call.
1055 * Returns with mutex_owner and mutex_ownerpid set correctly.
1056 */
1057 int
1058 mutex_trylock_kernel(mutex_t *mp)
1059 {
1060 ulwp_t *self = curthread;
1061 uberdata_t *udp = self->ul_uberdata;
1062 int mtype = mp->mutex_type;
1063 int error;
1064 int acquired;
1065
1066 for (;;) {
1067 /*
1068 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1069 * means we successfully acquired the lock.
1070 */
1071 if ((error = ___lwp_mutex_trylock(mp, self)) != 0 &&
1072 error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1073 acquired = 0;
1074 break;
1075 }
1076
1077 if (mtype & USYNC_PROCESS) {
1078 /*
1079 * Defend against forkall(). We may be the child,
1080 * in which case we don't actually own the mutex.
1081 */
1082 enter_critical(self);
1083 if (mp->mutex_ownerpid == udp->pid) {
1084 exit_critical(self);
1085 acquired = 1;
1086 break;
1087 }
1088 exit_critical(self);
1089 } else {
1090 acquired = 1;
1091 break;
1092 }
1093 }
1094
1095 if (acquired) {
1096 ASSERT(mp->mutex_owner == (uintptr_t)self);
1097 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1098 } else if (error != EBUSY) {
1099 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1100 }
1101
1102 return (error);
1103 }
1104
1105 volatile sc_shared_t *
1106 setup_schedctl(void)
1107 {
1108 ulwp_t *self = curthread;
1109 volatile sc_shared_t *scp;
1110 sc_shared_t *tmp;
1111
1112 if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
1113 !self->ul_vfork && /* not a child of vfork() */
1114 !self->ul_schedctl_called) { /* haven't been called before */
1115 enter_critical(self);
1116 self->ul_schedctl_called = &self->ul_uberdata->uberflags;
1117 if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
1118 self->ul_schedctl = scp = tmp;
1119 exit_critical(self);
1120 }
1121 /*
1122 * Unless the call to setup_schedctl() is surrounded
1123 * by enter_critical()/exit_critical(), the address
1124 * we are returning could be invalid due to a forkall()
1125 * having occurred in another thread.
1126 */
1127 return (scp);
1128 }
1129
1130 /*
1131 * Interfaces from libsched, incorporated into libc.
1132 * libsched.so.1 is now a filter library onto libc.
1133 */
1134 #pragma weak schedctl_lookup = schedctl_init
1135 schedctl_t *
1136 schedctl_init(void)
1137 {
1138 volatile sc_shared_t *scp = setup_schedctl();
1139 return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
1140 }
1141
1142 void
1143 schedctl_exit(void)
1144 {
1145 }
1146
1147 /*
1148 * Contract private interface for java.
1149 * Set up the schedctl data if it doesn't exist yet.
1150 * Return a pointer to the pointer to the schedctl data.
1151 */
1152 volatile sc_shared_t *volatile *
1153 _thr_schedctl(void)
1154 {
1155 ulwp_t *self = curthread;
1156 volatile sc_shared_t *volatile *ptr;
1157
1158 if (self->ul_vfork)
1159 return (NULL);
1160 if (*(ptr = &self->ul_schedctl) == NULL)
1161 (void) setup_schedctl();
1162 return (ptr);
1163 }
1164
1165 /*
1166 * Block signals and attempt to block preemption.
1167 * no_preempt()/preempt() must be used in pairs but can be nested.
1168 */
1169 void
1170 no_preempt(ulwp_t *self)
1171 {
1172 volatile sc_shared_t *scp;
1173
1174 if (self->ul_preempt++ == 0) {
1175 enter_critical(self);
1176 if ((scp = self->ul_schedctl) != NULL ||
1177 (scp = setup_schedctl()) != NULL) {
1178 /*
1179 * Save the pre-existing preempt value.
1180 */
1181 self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
1182 scp->sc_preemptctl.sc_nopreempt = 1;
1183 }
1184 }
1185 }
1186
1187 /*
1188 * Undo the effects of no_preempt().
1189 */
1190 void
1191 preempt(ulwp_t *self)
1192 {
1193 volatile sc_shared_t *scp;
1194
1195 ASSERT(self->ul_preempt > 0);
1196 if (--self->ul_preempt == 0) {
1197 if ((scp = self->ul_schedctl) != NULL) {
1198 /*
1199 * Restore the pre-existing preempt value.
1200 */
1201 scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
1202 if (scp->sc_preemptctl.sc_yield &&
1203 scp->sc_preemptctl.sc_nopreempt == 0) {
1204 yield();
1205 if (scp->sc_preemptctl.sc_yield) {
1206 /*
1207 * Shouldn't happen. This is either
1208 * a race condition or the thread
1209 * just entered the real-time class.
1210 */
1211 yield();
1212 scp->sc_preemptctl.sc_yield = 0;
1213 }
1214 }
1215 }
1216 exit_critical(self);
1217 }
1218 }
1219
1220 /*
1221 * If a call to preempt() would cause the current thread to yield or to
1222 * take deferred actions in exit_critical(), then unpark the specified
1223 * lwp so it can run while we delay. Return the original lwpid if the
1224 * unpark was not performed, else return zero. The tests are a repeat
1225 * of some of the tests in preempt(), above. This is a statistical
1226 * optimization solely for cond_sleep_queue(), below.
1227 */
1228 static lwpid_t
1229 preempt_unpark(ulwp_t *self, lwpid_t lwpid)
1230 {
1231 volatile sc_shared_t *scp = self->ul_schedctl;
1232
1233 ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
1234 if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
1235 (self->ul_curplease && self->ul_critical == 1)) {
1236 (void) __lwp_unpark(lwpid);
1237 lwpid = 0;
1238 }
1239 return (lwpid);
1240 }
1241
1242 /*
1243 * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1244 * If this fails, return EBUSY and let the caller deal with it.
1245 * If this succeeds, return 0 with mutex_owner set to curthread.
1246 */
1247 static int
1248 mutex_trylock_adaptive(mutex_t *mp, int tryhard)
1249 {
1250 ulwp_t *self = curthread;
1251 int error = EBUSY;
1252 ulwp_t *ulwp;
1253 volatile sc_shared_t *scp;
1254 volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
1255 volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
1256 uint32_t new_lockword;
1257 int count = 0;
1258 int max_count;
1259 uint8_t max_spinners;
1260
1261 ASSERT(!(mp->mutex_type & USYNC_PROCESS));
1262
1263 if (MUTEX_OWNED(mp, self))
1264 return (EBUSY);
1265
1266 enter_critical(self);
1267
1268 /* short-cut, not definitive (see below) */
1269 if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1270 ASSERT(mp->mutex_type & LOCK_ROBUST);
1271 error = ENOTRECOVERABLE;
1272 goto done;
1273 }
1274
1275 /*
1276 * Make one attempt to acquire the lock before
1277 * incurring the overhead of the spin loop.
1278 */
1279 if (set_lock_byte(lockp) == 0) {
1280 *ownerp = (uintptr_t)self;
1281 error = 0;
1282 goto done;
1283 }
1284 if (!tryhard)
1285 goto done;
1286 if (ncpus == 0)
1287 ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1288 if ((max_spinners = self->ul_max_spinners) >= ncpus)
1289 max_spinners = ncpus - 1;
1290 max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1291 if (max_count == 0)
1292 goto done;
1293
1294 /*
1295 * This spin loop is unfair to lwps that have already dropped into
1296 * the kernel to sleep. They will starve on a highly-contended mutex.
1297 * This is just too bad. The adaptive spin algorithm is intended
1298 * to allow programs with highly-contended locks (that is, broken
1299 * programs) to execute with reasonable speed despite their contention.
1300 * Being fair would reduce the speed of such programs and well-written
1301 * programs will not suffer in any case.
1302 */
1303 if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1304 goto done;
1305 DTRACE_PROBE1(plockstat, mutex__spin, mp);
1306 for (count = 1; ; count++) {
1307 if (*lockp == 0 && set_lock_byte(lockp) == 0) {
1308 *ownerp = (uintptr_t)self;
1309 error = 0;
1310 break;
1311 }
1312 if (count == max_count)
1313 break;
1314 SMT_PAUSE();
1315 /*
1316 * Stop spinning if the mutex owner is not running on
1317 * a processor; it will not drop the lock any time soon
1318 * and we would just be wasting time to keep spinning.
1319 *
1320 * Note that we are looking at another thread (ulwp_t)
1321 * without ensuring that the other thread does not exit.
1322 * The scheme relies on ulwp_t structures never being
1323 * deallocated by the library (the library employs a free
1324 * list of ulwp_t structs that are reused when new threads
1325 * are created) and on schedctl shared memory never being
1326 * deallocated once created via __schedctl().
1327 *
1328 * Thus, the worst that can happen when the spinning thread
1329 * looks at the owner's schedctl data is that it is looking
1330 * at some other thread's schedctl data. This almost never
1331 * happens and is benign when it does.
1332 */
1333 if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1334 ((scp = ulwp->ul_schedctl) == NULL ||
1335 scp->sc_state != SC_ONPROC))
1336 break;
1337 }
1338 new_lockword = spinners_decr(&mp->mutex_lockword);
1339 if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1340 /*
1341 * We haven't yet acquired the lock, the lock
1342 * is free, and there are no other spinners.
1343 * Make one final attempt to acquire the lock.
1344 *
1345 * This isn't strictly necessary since mutex_lock_queue()
1346 * (the next action this thread will take if it doesn't
1347 * acquire the lock here) makes one attempt to acquire
1348 * the lock before putting the thread to sleep.
1349 *
1350 * If the next action for this thread (on failure here)
1351 * were not to call mutex_lock_queue(), this would be
1352 * necessary for correctness, to avoid ending up with an
1353 * unheld mutex with waiters but no one to wake them up.
1354 */
1355 if (set_lock_byte(lockp) == 0) {
1356 *ownerp = (uintptr_t)self;
1357 error = 0;
1358 }
1359 count++;
1360 }
1361
1362 done:
1363 if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1364 ASSERT(mp->mutex_type & LOCK_ROBUST);
1365 /*
1366 * We shouldn't own the mutex.
1367 * Just clear the lock; everyone has already been woken up.
1368 */
1369 *ownerp = 0;
1370 (void) clear_lockbyte(&mp->mutex_lockword);
1371 error = ENOTRECOVERABLE;
1372 }
1373
1374 exit_critical(self);
1375
1376 if (error) {
1377 if (count) {
1378 DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1379 }
1380 if (error != EBUSY) {
1381 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1382 }
1383 } else {
1384 if (count) {
1385 DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1386 }
1387 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1388 if (mp->mutex_flag & LOCK_OWNERDEAD) {
1389 ASSERT(mp->mutex_type & LOCK_ROBUST);
1390 error = EOWNERDEAD;
1391 }
1392 }
1393
1394 return (error);
1395 }
1396
1397 /*
1398 * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1399 * The owner field is not set here; the caller (spin_lock_set()) sets it.
1400 */
1401 static int
1402 mutex_queuelock_adaptive(mutex_t *mp)
1403 {
1404 ulwp_t *ulwp;
1405 volatile sc_shared_t *scp;
1406 volatile uint8_t *lockp;
1407 volatile uint64_t *ownerp;
1408 int count = curthread->ul_queue_spin;
1409
1410 ASSERT(mp->mutex_type == USYNC_THREAD);
1411
1412 if (count == 0)
1413 return (EBUSY);
1414
1415 lockp = (volatile uint8_t *)&mp->mutex_lockw;
1416 ownerp = (volatile uint64_t *)&mp->mutex_owner;
1417 while (--count >= 0) {
1418 if (*lockp == 0 && set_lock_byte(lockp) == 0)
1419 return (0);
1420 SMT_PAUSE();
1421 if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1422 ((scp = ulwp->ul_schedctl) == NULL ||
1423 scp->sc_state != SC_ONPROC))
1424 break;
1425 }
1426
1427 return (EBUSY);
1428 }
1429
1430 /*
1431 * Like mutex_trylock_adaptive(), but for process-shared mutexes.
1432 * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1433 * If this fails, return EBUSY and let the caller deal with it.
1434 * If this succeeds, return 0 with mutex_owner set to curthread
1435 * and mutex_ownerpid set to the current pid.
1436 */
1437 static int
1438 mutex_trylock_process(mutex_t *mp, int tryhard)
1439 {
1440 ulwp_t *self = curthread;
1441 uberdata_t *udp = self->ul_uberdata;
1442 int error = EBUSY;
1443 volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
1444 uint32_t new_lockword;
1445 int count = 0;
1446 int max_count;
1447 uint8_t max_spinners;
1448
1449 #if defined(__sparc) && !defined(_LP64)
1450 /* horrible hack, necessary only on 32-bit sparc */
1451 int fix_alignment_problem =
1452 (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1453 self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST));
1454 #endif
1455
1456 ASSERT(mp->mutex_type & USYNC_PROCESS);
1457
1458 if (shared_mutex_held(mp))
1459 return (EBUSY);
1460
1461 enter_critical(self);
1462
1463 /* short-cut, not definitive (see below) */
1464 if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1465 ASSERT(mp->mutex_type & LOCK_ROBUST);
1466 error = ENOTRECOVERABLE;
1467 goto done;
1468 }
1469
1470 /*
1471 * Make one attempt to acquire the lock before
1472 * incurring the overhead of the spin loop.
1473 */
1474 #if defined(__sparc) && !defined(_LP64)
1475 /* horrible hack, necessary only on 32-bit sparc */
1476 if (fix_alignment_problem) {
1477 if (set_lock_byte(&mp->mutex_lockw) == 0) {
1478 mp->mutex_ownerpid = udp->pid;
1479 mp->mutex_owner = (uintptr_t)self;
1480 error = 0;
1481 goto done;
1482 }
1483 } else
1484 #endif
1485 if (set_lock_byte64(lockp, udp->pid) == 0) {
1486 mp->mutex_owner = (uintptr_t)self;
1487 /* mp->mutex_ownerpid was set by set_lock_byte64() */
1488 error = 0;
1489 goto done;
1490 }
1491 if (!tryhard)
1492 goto done;
1493 if (ncpus == 0)
1494 ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1495 if ((max_spinners = self->ul_max_spinners) >= ncpus)
1496 max_spinners = ncpus - 1;
1497 max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1498 if (max_count == 0)
1499 goto done;
1500
1501 /*
1502 * This is a process-shared mutex.
1503 * We cannot know if the owner is running on a processor.
1504 * We just spin and hope that it is on a processor.
1505 */
1506 if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1507 goto done;
1508 DTRACE_PROBE1(plockstat, mutex__spin, mp);
1509 for (count = 1; ; count++) {
1510 #if defined(__sparc) && !defined(_LP64)
1511 /* horrible hack, necessary only on 32-bit sparc */
1512 if (fix_alignment_problem) {
1513 if ((*lockp & LOCKMASK64) == 0 &&
1514 set_lock_byte(&mp->mutex_lockw) == 0) {
1515 mp->mutex_ownerpid = udp->pid;
1516 mp->mutex_owner = (uintptr_t)self;
1517 error = 0;
1518 break;
1519 }
1520 } else
1521 #endif
1522 if ((*lockp & LOCKMASK64) == 0 &&
1523 set_lock_byte64(lockp, udp->pid) == 0) {
1524 mp->mutex_owner = (uintptr_t)self;
1525 /* mp->mutex_ownerpid was set by set_lock_byte64() */
1526 error = 0;
1527 break;
1528 }
1529 if (count == max_count)
1530 break;
1531 SMT_PAUSE();
1532 }
1533 new_lockword = spinners_decr(&mp->mutex_lockword);
1534 if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1535 /*
1536 * We haven't yet acquired the lock, the lock
1537 * is free, and there are no other spinners.
1538 * Make one final attempt to acquire the lock.
1539 *
1540 * This isn't strictly necessary since mutex_lock_kernel()
1541 * (the next action this thread will take if it doesn't
1542 * acquire the lock here) makes one attempt to acquire
1543 * the lock before putting the thread to sleep.
1544 *
1545 * If the next action for this thread (on failure here)
1546 * were not to call mutex_lock_kernel(), this would be
1547 * necessary for correctness, to avoid ending up with an
1548 * unheld mutex with waiters but no one to wake them up.
1549 */
1550 #if defined(__sparc) && !defined(_LP64)
1551 /* horrible hack, necessary only on 32-bit sparc */
1552 if (fix_alignment_problem) {
1553 if (set_lock_byte(&mp->mutex_lockw) == 0) {
1554 mp->mutex_ownerpid = udp->pid;
1555 mp->mutex_owner = (uintptr_t)self;
1556 error = 0;
1557 }
1558 } else
1559 #endif
1560 if (set_lock_byte64(lockp, udp->pid) == 0) {
1561 mp->mutex_owner = (uintptr_t)self;
1562 /* mp->mutex_ownerpid was set by set_lock_byte64() */
1563 error = 0;
1564 }
1565 count++;
1566 }
1567
1568 done:
1569 if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1570 ASSERT(mp->mutex_type & LOCK_ROBUST);
1571 /*
1572 * We shouldn't own the mutex.
1573 * Just clear the lock; everyone has already been woken up.
1574 */
1575 mp->mutex_owner = 0;
1576 /* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1577 (void) clear_lockbyte64(&mp->mutex_lockword64);
1578 error = ENOTRECOVERABLE;
1579 }
1580
1581 exit_critical(self);
1582
1583 if (error) {
1584 if (count) {
1585 DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1586 }
1587 if (error != EBUSY) {
1588 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1589 }
1590 } else {
1591 if (count) {
1592 DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1593 }
1594 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1595 if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1596 ASSERT(mp->mutex_type & LOCK_ROBUST);
1597 if (mp->mutex_flag & LOCK_OWNERDEAD)
1598 error = EOWNERDEAD;
1599 else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
1600 error = ELOCKUNMAPPED;
1601 else
1602 error = EOWNERDEAD;
1603 }
1604 }
1605
1606 return (error);
1607 }
1608
1609 /*
1610 * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1611 * Returns the lwpid of the thread that was dequeued, if any.
1612 * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1613 * to wake up the specified lwp.
1614 */
1615 static lwpid_t
1616 mutex_wakeup(mutex_t *mp)
1617 {
1618 lwpid_t lwpid = 0;
1619 int more;
1620 queue_head_t *qp;
1621 ulwp_t *ulwp;
1622
1623 /*
1624 * Dequeue a waiter from the sleep queue. Don't touch the mutex
1625 * waiters bit if no one was found on the queue because the mutex
1626 * might have been deallocated or reallocated for another purpose.
1627 */
1628 qp = queue_lock(mp, MX);
1629 if ((ulwp = dequeue(qp, &more)) != NULL) {
1630 lwpid = ulwp->ul_lwpid;
1631 mp->mutex_waiters = more;
1632 }
1633 queue_unlock(qp);
1634 return (lwpid);
1635 }
1636
1637 /*
1638 * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
1639 */
1640 static void
1641 mutex_wakeup_all(mutex_t *mp)
1642 {
1643 queue_head_t *qp;
1644 queue_root_t *qrp;
1645 int nlwpid = 0;
1646 int maxlwps = MAXLWPS;
1647 ulwp_t *ulwp;
1648 lwpid_t buffer[MAXLWPS];
1649 lwpid_t *lwpid = buffer;
1650
1651 /*
1652 * Walk the list of waiters and prepare to wake up all of them.
1653 * The waiters flag has already been cleared from the mutex.
1654 *
1655 * We keep track of lwpids that are to be unparked in lwpid[].
1656 * __lwp_unpark_all() is called to unpark all of them after
1657 * they have been removed from the sleep queue and the sleep
1658 * queue lock has been dropped. If we run out of space in our
1659 * on-stack buffer, we need to allocate more but we can't call
1660 * lmalloc() because we are holding a queue lock when the overflow
1661 * occurs and lmalloc() acquires a lock. We can't use alloca()
1662 * either because the application may have allocated a small
1663 * stack and we don't want to overrun the stack. So we call
1664 * alloc_lwpids() to allocate a bigger buffer using the mmap()
1665 * system call directly since that path acquires no locks.
1666 */
1667 qp = queue_lock(mp, MX);
1668 for (;;) {
1669 if ((qrp = qp->qh_root) == NULL ||
1670 (ulwp = qrp->qr_head) == NULL)
1671 break;
1672 ASSERT(ulwp->ul_wchan == mp);
1673 queue_unlink(qp, &qrp->qr_head, NULL);
1674 ulwp->ul_sleepq = NULL;
1675 ulwp->ul_wchan = NULL;
1676 if (nlwpid == maxlwps)
1677 lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
1678 lwpid[nlwpid++] = ulwp->ul_lwpid;
1679 }
1680
1681 if (nlwpid == 0) {
1682 queue_unlock(qp);
1683 } else {
1684 mp->mutex_waiters = 0;
1685 no_preempt(curthread);
1686 queue_unlock(qp);
1687 if (nlwpid == 1)
1688 (void) __lwp_unpark(lwpid[0]);
1689 else
1690 (void) __lwp_unpark_all(lwpid, nlwpid);
1691 preempt(curthread);
1692 }
1693
1694 if (lwpid != buffer)
1695 (void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
1696 }
1697
1698 /*
1699 * Release a process-private mutex.
1700 * As an optimization, if there are waiters but there are also spinners
1701 * attempting to acquire the mutex, then don't bother waking up a waiter;
1702 * one of the spinners will acquire the mutex soon and it would be a waste
1703 * of resources to wake up some thread just to have it spin for a while
1704 * and then possibly go back to sleep. See mutex_trylock_adaptive().
1705 */
1706 static lwpid_t
1707 mutex_unlock_queue(mutex_t *mp, int release_all)
1708 {
1709 ulwp_t *self = curthread;
1710 lwpid_t lwpid = 0;
1711 uint32_t old_lockword;
1712
1713 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1714 sigoff(self);
1715 mp->mutex_owner = 0;
1716 old_lockword = clear_lockbyte(&mp->mutex_lockword);
1717 if ((old_lockword & WAITERMASK) &&
1718 (release_all || (old_lockword & SPINNERMASK) == 0)) {
1719 no_preempt(self); /* ensure a prompt wakeup */
1720 if (release_all)
1721 mutex_wakeup_all(mp);
1722 else
1723 lwpid = mutex_wakeup(mp);
1724 if (lwpid == 0)
1725 preempt(self);
1726 }
1727 sigon(self);
1728 return (lwpid);
1729 }
1730
1731 /*
1732 * Like mutex_unlock_queue(), but for process-shared mutexes.
1733 */
1734 static void
1735 mutex_unlock_process(mutex_t *mp, int release_all)
1736 {
1737 ulwp_t *self = curthread;
1738 uint64_t old_lockword64;
1739
1740 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1741 sigoff(self);
1742 mp->mutex_owner = 0;
1743 #if defined(__sparc) && !defined(_LP64)
1744 /* horrible hack, necessary only on 32-bit sparc */
1745 if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1746 self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST)) {
1747 uint32_t old_lockword;
1748 mp->mutex_ownerpid = 0;
1749 old_lockword = clear_lockbyte(&mp->mutex_lockword);
1750 if ((old_lockword & WAITERMASK) &&
1751 (release_all || (old_lockword & SPINNERMASK) == 0)) {
1752 no_preempt(self); /* ensure a prompt wakeup */
1753 (void) ___lwp_mutex_wakeup(mp, release_all);
1754 preempt(self);
1755 }
1756 sigon(self);
1757 return;
1758 }
1759 #endif
1760 /* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1761 old_lockword64 = clear_lockbyte64(&mp->mutex_lockword64);
1762 if ((old_lockword64 & WAITERMASK64) &&
1763 (release_all || (old_lockword64 & SPINNERMASK64) == 0)) {
1764 no_preempt(self); /* ensure a prompt wakeup */
1765 (void) ___lwp_mutex_wakeup(mp, release_all);
1766 preempt(self);
1767 }
1768 sigon(self);
1769 }
1770
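/*
 * Block forever. stall_mutex is held by lwp#1 and never released,
 * so a thread that has encountered an unrecoverable error on a
 * LOCK_PRIO_INHERIT mutex parks here in the kernel indefinitely.
 */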
1771 void
1772 stall(void)
1773 {
1774 for (;;)
1775 (void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
1776 }
1777
1778 /*
1779 * Acquire a USYNC_THREAD mutex via user-level sleep queues.
1780 * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
1781 * If successful, returns with mutex_owner set correctly.
1782 */
1783 int
1784 mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
1785 timespec_t *tsp)
1786 {
1787 uberdata_t *udp = curthread->ul_uberdata;
1788 queue_head_t *qp;
1789 hrtime_t begin_sleep;
1790 int error = 0;
1791
1792 self->ul_sp = stkptr();
1793 if (__td_event_report(self, TD_SLEEP, udp)) {
1794 self->ul_wchan = mp;
1795 self->ul_td_evbuf.eventnum = TD_SLEEP;
1796 self->ul_td_evbuf.eventdata = mp;
1797 tdb_event(TD_SLEEP, udp);
1798 }
1799 if (msp) {
1800 tdb_incr(msp->mutex_sleep);
1801 begin_sleep = gethrtime();
1802 }
1803
1804 DTRACE_PROBE1(plockstat, mutex__block, mp);
1805
1806 /*
1807 * Put ourself on the sleep queue, and while we are
1808 * unable to grab the lock, go park in the kernel.
1809 * Take ourself off the sleep queue after we acquire the lock.
1810 * The waiter bit can be set/cleared only while holding the queue lock.
1811 */
1812 qp = queue_lock(mp, MX);
1813 enqueue(qp, self, 0);
1814 mp->mutex_waiters = 1;
1815 for (;;) {
1816 if (set_lock_byte(&mp->mutex_lockw) == 0) {
1817 mp->mutex_owner = (uintptr_t)self;
1818 mp->mutex_waiters = dequeue_self(qp);
1819 break;
1820 }
1821 set_parking_flag(self, 1);
1822 queue_unlock(qp);
1823 /*
1824 * __lwp_park() will return the residual time in tsp
1825 * if we are unparked before the timeout expires.
1826 */
1827 error = __lwp_park(tsp, 0);
1828 set_parking_flag(self, 0);
1829 /*
1830 * We could have taken a signal or suspended ourself.
1831 * If we did, then we removed ourself from the queue.
1832 * Someone else may have removed us from the queue
1833 * as a consequence of mutex_unlock(). We may have
1834 * gotten a timeout from __lwp_park(). Or we may still
1835 * be on the queue and this is just a spurious wakeup.
1836 */
1837 qp = queue_lock(mp, MX);
1838 if (self->ul_sleepq == NULL) {
1839 if (error) {
1840 mp->mutex_waiters = queue_waiter(qp)? 1 : 0;
1841 if (error != EINTR)
1842 break;
1843 error = 0;
1844 }
1845 if (set_lock_byte(&mp->mutex_lockw) == 0) {
1846 mp->mutex_owner = (uintptr_t)self;
1847 break;
1848 }
1849 enqueue(qp, self, 0);
1850 mp->mutex_waiters = 1;
1851 }
1852 ASSERT(self->ul_sleepq == qp &&
1853 self->ul_qtype == MX &&
1854 self->ul_wchan == mp);
1855 if (error) {
1856 if (error != EINTR) {
1857 mp->mutex_waiters = dequeue_self(qp);
1858 break;
1859 }
1860 error = 0;
1861 }
1862 }
1863 ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
1864 self->ul_wchan == NULL);
1865 self->ul_sp = 0;
1866
1867 ASSERT(error == 0 || error == EINVAL || error == ETIME);
1868
1869 if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1870 ASSERT(mp->mutex_type & LOCK_ROBUST);
1871 /*
1872 * We shouldn't own the mutex.
1873 * Just clear the lock; everyone has already been woken up.
1874 */
1875 mp->mutex_owner = 0;
1876 (void) clear_lockbyte(&mp->mutex_lockword);
1877 error = ENOTRECOVERABLE;
1878 }
1879
1880 queue_unlock(qp);
1881
1882 if (msp)
1883 msp->mutex_sleep_time += gethrtime() - begin_sleep;
1884
1885 if (error) {
1886 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1887 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1888 } else {
1889 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1890 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1891 if (mp->mutex_flag & LOCK_OWNERDEAD) {
1892 ASSERT(mp->mutex_type & LOCK_ROBUST);
1893 error = EOWNERDEAD;
1894 }
1895 }
1896
1897 return (error);
1898 }
1899
1900 static int
1901 mutex_recursion(mutex_t *mp, int mtype, int try)
1902 {
1903 ASSERT(mutex_held(mp));
1904 ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
1905 ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1906
1907 if (mtype & LOCK_RECURSIVE) {
1908 if (mp->mutex_rcount == RECURSION_MAX) {
1909 DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
1910 return (EAGAIN);
1911 }
1912 mp->mutex_rcount++;
1913 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1914 return (0);
1915 }
1916 if (try == MUTEX_LOCK) {
1917 DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1918 return (EDEADLK);
1919 }
1920 return (EBUSY);
1921 }
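/*
 * Illustrative sketch (not part of libc) of the caller-visible behavior
 * that mutex_recursion() implements: a PTHREAD_MUTEX_RECURSIVE
 * (LOCK_RECURSIVE) mutex may be re-locked by its owner, each lock
 * balanced by an unlock, while an errorcheck-only mutex fails a re-lock
 * with EDEADLK (or EBUSY for a trylock).  recursion_demo() is a
 * hypothetical caller.
 *
 *     #include <pthread.h>
 *     #include <assert.h>
 *
 *     static void
 *     recursion_demo(void)
 *     {
 *         pthread_mutexattr_t a;
 *         pthread_mutex_t m;
 *
 *         (void) pthread_mutexattr_init(&a);
 *         (void) pthread_mutexattr_settype(&a, PTHREAD_MUTEX_RECURSIVE);
 *         (void) pthread_mutex_init(&m, &a);
 *         (void) pthread_mutexattr_destroy(&a);
 *
 *         (void) pthread_mutex_lock(&m);
 *         assert(pthread_mutex_lock(&m) == 0);
 *         (void) pthread_mutex_unlock(&m);
 *         (void) pthread_mutex_unlock(&m);
 *         (void) pthread_mutex_destroy(&m);
 *     }
 */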
1922
1923 /*
1924 * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
1925 * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
1926 * We use tdb_hash_lock here and in the synch object tracking code in
1927 * the tdb_agent.c file. There is no conflict between these two usages.
1928 */
1929 void
1930 register_lock(mutex_t *mp)
1931 {
1932 uberdata_t *udp = curthread->ul_uberdata;
1933 uint_t hash = LOCK_HASH(mp);
1934 robust_t *rlp;
1935 robust_t *invalid;
1936 robust_t **rlpp;
1937 robust_t **table;
1938
1939 if ((table = udp->robustlocks) == NULL) {
1940 lmutex_lock(&udp->tdb_hash_lock);
1941 if ((table = udp->robustlocks) == NULL) {
1942 table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
1943 membar_producer();
1944 udp->robustlocks = table;
1945 }
1946 lmutex_unlock(&udp->tdb_hash_lock);
1947 }
1948 membar_consumer();
1949
1950 /*
1951 * First search the registered table with no locks held.
1952 * This is safe because the table never shrinks
1953 * and we can only get a false negative.
1954 */
1955 for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
1956 if (rlp->robust_lock == mp) /* already registered */
1957 return;
1958 }
1959
1960 /*
1961 * The lock was not found.
1962 * Repeat the operation with tdb_hash_lock held.
1963 */
1964 lmutex_lock(&udp->tdb_hash_lock);
1965
1966 invalid = NULL;
1967 for (rlpp = &table[hash];
1968 (rlp = *rlpp) != NULL;
1969 rlpp = &rlp->robust_next) {
1970 if (rlp->robust_lock == mp) { /* already registered */
1971 lmutex_unlock(&udp->tdb_hash_lock);
1972 return;
1973 }
1974 /* remember the first invalid entry, if any */
1975 if (rlp->robust_lock == INVALID_ADDR && invalid == NULL)
1976 invalid = rlp;
1977 }
1978
1979 /*
1980 * The lock has never been registered.
1981 * Add it to the table and register it now.
1982 */
1983 if ((rlp = invalid) != NULL) {
1984 /*
1985 * Reuse the invalid entry we found above.
1986 * The linkages are still correct.
1987 */
1988 rlp->robust_lock = mp;
1989 membar_producer();
1990 } else {
1991 /*
1992 * Allocate a new entry and add it to
1993 * the hash table and to the global list.
1994 */
1995 rlp = lmalloc(sizeof (*rlp));
1996 rlp->robust_lock = mp;
1997 rlp->robust_next = NULL;
1998 rlp->robust_list = udp->robustlist;
1999 udp->robustlist = rlp;
2000 membar_producer();
2001 *rlpp = rlp;
2002 }
2003
2004 lmutex_unlock(&udp->tdb_hash_lock);
2005
2006 (void) ___lwp_mutex_register(mp, &rlp->robust_lock);
2007 }
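/*
 * Illustrative sketch (not part of libc): the application-level setup
 * that produces a mutex reaching register_lock() above, i.e. one that
 * is both USYNC_PROCESS (PTHREAD_PROCESS_SHARED) and LOCK_ROBUST
 * (PTHREAD_MUTEX_ROBUST).  Such a mutex would normally live in shared
 * memory; robust_shared_init() is a hypothetical helper.
 *
 *     #include <pthread.h>
 *
 *     static int
 *     robust_shared_init(pthread_mutex_t *mp)
 *     {
 *         pthread_mutexattr_t a;
 *         int error;
 *
 *         (void) pthread_mutexattr_init(&a);
 *         (void) pthread_mutexattr_setpshared(&a, PTHREAD_PROCESS_SHARED);
 *         (void) pthread_mutexattr_setrobust(&a, PTHREAD_MUTEX_ROBUST);
 *         error = pthread_mutex_init(mp, &a);
 *         (void) pthread_mutexattr_destroy(&a);
 *         return (error);
 *     }
 */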
2008
2009 /*
2010 * This is called in the child of fork()/forkall() to start over
2011 * with a clean slate. (Each process must register its own locks.)
2012 * No locks are needed because all other threads are suspended or gone.
2013 */
2014 void
2015 unregister_locks(void)
2016 {
2017 uberdata_t *udp = curthread->ul_uberdata;
2018 robust_t **table;
2019 robust_t *rlp;
2020 robust_t *next;
2021
2022 /*
2023 * Do this first, before calling lfree().
2024 */
2025 table = udp->robustlocks;
2026 udp->robustlocks = NULL;
2027 rlp = udp->robustlist;
2028 udp->robustlist = NULL;
2029
2030 /*
2031 * Do this by traversing the global list, not the hash table.
2032 */
2033 while (rlp != NULL) {
2034 next = rlp->robust_list;
2035 lfree(rlp, sizeof (*rlp));
2036 rlp = next;
2037 }
2038 if (table != NULL)
2039 lfree(table, LOCKHASHSZ * sizeof (robust_t *));
2040 }
2041
2042 /*
2043 * Returns with mutex_owner set correctly.
2044 */
2045 int
2046 mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
2047 {
2048 ulwp_t *self = curthread;
2049 uberdata_t *udp = self->ul_uberdata;
2050 int mtype = mp->mutex_type;
2051 tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2052 int error = 0;
2053 int noceil = try & MUTEX_NOCEIL;
2054 uint8_t ceil;
2055 int myprio;
2056
2057 try &= ~MUTEX_NOCEIL;
2058 ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
2059
2060 if (!self->ul_schedctl_called)
2061 (void) setup_schedctl();
2062
2063 if (msp && try == MUTEX_TRY)
2064 tdb_incr(msp->mutex_try);
2065
2066 if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_held(mp))
2067 return (mutex_recursion(mp, mtype, try));
2068
2069 if (self->ul_error_detection && try == MUTEX_LOCK &&
2070 tsp == NULL && mutex_held(mp))
2071 lock_error(mp, "mutex_lock", NULL, NULL);
2072
2073 if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2074 update_sched(self);
2075 if (self->ul_cid != self->ul_rtclassid) {
2076 DTRACE_PROBE2(plockstat, mutex__error, mp, EPERM);
2077 return (EPERM);
2078 }
2079 ceil = mp->mutex_ceiling;
2080 myprio = self->ul_epri? self->ul_epri : self->ul_pri;
2081 if (myprio > ceil) {
2082 DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
2083 return (EINVAL);
2084 }
2085 if ((error = _ceil_mylist_add(mp)) != 0) {
2086 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
2087 return (error);
2088 }
2089 if (myprio < ceil)
2090 _ceil_prio_inherit(ceil);
2091 }
2092
2093 if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
2094 == (USYNC_PROCESS | LOCK_ROBUST))
2095 register_lock(mp);
2096
2097 if (mtype & LOCK_PRIO_INHERIT) {
2098 /* go straight to the kernel */
2099 if (try == MUTEX_TRY)
2100 error = mutex_trylock_kernel(mp);
2101 else /* MUTEX_LOCK */
2102 error = mutex_lock_kernel(mp, tsp, msp);
2103 /*
2104 * The kernel never sets or clears the lock byte
2105 * for LOCK_PRIO_INHERIT mutexes.
2106 * Set it here for consistency.
2107 */
2108 switch (error) {
2109 case 0:
2110 self->ul_pilocks++;
2111 mp->mutex_lockw = LOCKSET;
2112 break;
2113 case EOWNERDEAD:
2114 case ELOCKUNMAPPED:
2115 self->ul_pilocks++;
2116 mp->mutex_lockw = LOCKSET;
2117 /* FALLTHROUGH */
2118 case ENOTRECOVERABLE:
2119 ASSERT(mtype & LOCK_ROBUST);
2120 break;
2121 case EDEADLK:
2122 if (try == MUTEX_TRY) {
2123 error = EBUSY;
2124 } else if (tsp != NULL) { /* simulate a timeout */
2125 /*
2126 * Note: mutex_timedlock() never returns EINTR.
2127 */
2128 timespec_t ts = *tsp;
2129 timespec_t rts;
2130
2131 while (__nanosleep(&ts, &rts) == EINTR)
2132 ts = rts;
2133 error = ETIME;
2134 } else { /* simulate a deadlock */
2135 stall();
2136 }
2137 break;
2138 }
2139 } else if (mtype & USYNC_PROCESS) {
2140 error = mutex_trylock_process(mp, try == MUTEX_LOCK);
2141 if (error == EBUSY && try == MUTEX_LOCK)
2142 error = mutex_lock_kernel(mp, tsp, msp);
2143 } else { /* USYNC_THREAD */
2144 error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
2145 if (error == EBUSY && try == MUTEX_LOCK)
2146 error = mutex_lock_queue(self, msp, mp, tsp);
2147 }
2148
2149 switch (error) {
2150 case 0:
2151 case EOWNERDEAD:
2152 case ELOCKUNMAPPED:
2153 if (mtype & LOCK_ROBUST)
2154 remember_lock(mp);
2155 if (msp)
2156 record_begin_hold(msp);
2157 break;
2158 default:
2159 if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2160 (void) _ceil_mylist_del(mp);
2161 if (myprio < ceil)
2162 _ceil_prio_waive();
2163 }
2164 if (try == MUTEX_TRY) {
2165 if (msp)
2166 tdb_incr(msp->mutex_try_fail);
2167 if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2168 self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2169 tdb_event(TD_LOCK_TRY, udp);
2170 }
2171 }
2172 break;
2173 }
2174
2175 return (error);
2176 }
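/*
 * Illustrative sketch (not part of libc): creating the kind of
 * LOCK_PRIO_PROTECT mutex whose ceiling checks appear in
 * mutex_lock_internal() above.  A locking thread must be in the
 * real-time class and must not have a priority above the ceiling;
 * while the lock is held its priority is boosted to the ceiling via
 * _ceil_prio_inherit().  ceiling_mutex_init() is a hypothetical helper.
 *
 *     #include <pthread.h>
 *
 *     static int
 *     ceiling_mutex_init(pthread_mutex_t *mp, int ceiling)
 *     {
 *         pthread_mutexattr_t a;
 *         int error;
 *
 *         (void) pthread_mutexattr_init(&a);
 *         (void) pthread_mutexattr_setprotocol(&a, PTHREAD_PRIO_PROTECT);
 *         (void) pthread_mutexattr_setprioceiling(&a, ceiling);
 *         error = pthread_mutex_init(mp, &a);
 *         (void) pthread_mutexattr_destroy(&a);
 *         return (error);
 *     }
 */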
2177
2178 int
2179 fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
2180 {
2181 ulwp_t *self = curthread;
2182 uberdata_t *udp = self->ul_uberdata;
2183
2184 /*
2185 * We know that USYNC_PROCESS is set in mtype and that
2186 * zero, one, or both of the flags LOCK_RECURSIVE and
2187 * LOCK_ERRORCHECK are set, and that no other flags are set.
2188 */
2189 ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
2190 enter_critical(self);
2191 #if defined(__sparc) && !defined(_LP64)
2192 /* horrible hack, necessary only on 32-bit sparc */
2193 if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2194 self->ul_misaligned) {
2195 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2196 mp->mutex_ownerpid = udp->pid;
2197 mp->mutex_owner = (uintptr_t)self;
2198 exit_critical(self);
2199 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2200 return (0);
2201 }
2202 } else
2203 #endif
2204 if (set_lock_byte64(&mp->mutex_lockword64, udp->pid) == 0) {
2205 mp->mutex_owner = (uintptr_t)self;
2206 /* mp->mutex_ownerpid was set by set_lock_byte64() */
2207 exit_critical(self);
2208 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2209 return (0);
2210 }
2211 exit_critical(self);
2212
2213 if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
2214 return (mutex_recursion(mp, mtype, try));
2215
2216 if (try == MUTEX_LOCK) {
2217 if (mutex_trylock_process(mp, 1) == 0)
2218 return (0);
2219 return (mutex_lock_kernel(mp, tsp, NULL));
2220 }
2221
2222 if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2223 self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2224 tdb_event(TD_LOCK_TRY, udp);
2225 }
2226 return (EBUSY);
2227 }
2228
2229 static int
2230 mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
2231 {
2232 ulwp_t *self = curthread;
2233 int mtype = mp->mutex_type;
2234 uberflags_t *gflags;
2235
2236 if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2237 self->ul_error_detection && self->ul_misaligned == 0)
2238 lock_error(mp, "mutex_lock", NULL, "mutex is misaligned");
2239
2240 /*
2241 * Optimize the case of USYNC_THREAD, including
2242 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2243 * no error detection, no lock statistics,
2244 * and the process has only a single thread.
2245 * (Most likely a traditional single-threaded application.)
2246 */
2247 if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2248 self->ul_uberdata->uberflags.uf_all) == 0) {
2249 /*
2250 * Only one thread exists so we don't need an atomic operation.
2251 * We do, however, need to protect against signals.
2252 */
2253 if (mp->mutex_lockw == 0) {
2254 sigoff(self);
2255 mp->mutex_lockw = LOCKSET;
2256 mp->mutex_owner = (uintptr_t)self;
2257 sigon(self);
2258 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2259 return (0);
2260 }
2261 if (mtype && MUTEX_OWNER(mp) == self)
2262 return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2263 /*
2264 * We have reached a deadlock, probably because the
2265 * process is executing non-async-signal-safe code in
2266 * a signal handler and is attempting to acquire a lock
2267 * that it already owns. This is not surprising, given
2268 * bad programming practices over the years that have
2269 * resulted in applications calling printf() and such
2270 * in their signal handlers. Unless the user has told
2271 * us that the signal handlers are safe by setting:
2272 * export _THREAD_ASYNC_SAFE=1
2273 * we return EDEADLK rather than actually deadlocking.
2274 *
2275 * A lock may explicitly override this with the
2276 * LOCK_DEADLOCK flag which is currently set for POSIX
2277 * NORMAL mutexes as the specification requires deadlock
2278 * behavior and applications _do_ rely on that for their
2279 * correctness guarantees.
2280 */
2281 if (tsp == NULL &&
2282 MUTEX_OWNER(mp) == self && !self->ul_async_safe &&
2283 (mp->mutex_flag & LOCK_DEADLOCK) == 0) {
2284 DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
2285 return (EDEADLK);
2286 }
2287 }
2288
2289 /*
2290 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2291 * no error detection, and no lock statistics.
2292 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2293 */
2294 if ((gflags = self->ul_schedctl_called) != NULL &&
2295 (gflags->uf_trs_ted |
2296 (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2297 if (mtype & USYNC_PROCESS)
2298 return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
2299 sigoff(self);
2300 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2301 mp->mutex_owner = (uintptr_t)self;
2302 sigon(self);
2303 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2304 return (0);
2305 }
2306 sigon(self);
2307 if (mtype && MUTEX_OWNER(mp) == self)
2308 return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2309 if (mutex_trylock_adaptive(mp, 1) != 0)
2310 return (mutex_lock_queue(self, NULL, mp, tsp));
2311 return (0);
2312 }
2313
2314 /* else do it the long way */
2315 return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
2316 }
2317
2318 #pragma weak pthread_mutex_lock = mutex_lock
2319 #pragma weak _mutex_lock = mutex_lock
2320 int
2321 mutex_lock(mutex_t *mp)
2322 {
2323 ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2324 return (mutex_lock_impl(mp, NULL));
2325 }
2326
2327 #pragma weak pthread_mutex_enter_np = mutex_enter
2328 void
2329 mutex_enter(mutex_t *mp)
2330 {
2331 int ret;
2332 int attr = mp->mutex_type & ALL_ATTRIBUTES;
2333
2334 /*
2335 * Require LOCK_ERRORCHECK, accept LOCK_RECURSIVE.
2336 */
2337 if (attr != LOCK_ERRORCHECK &&
2338 attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) {
2339 mutex_panic(mp, "mutex_enter: bad mutex type");
2340 }
2341 ret = mutex_lock(mp);
2342 if (ret == EDEADLK) {
2343 mutex_panic(mp, "recursive mutex_enter");
2344 } else if (ret == EAGAIN) {
2345 mutex_panic(mp, "excessive recursive mutex_enter");
2346 } else if (ret != 0) {
2347 mutex_panic(mp, "unknown mutex_enter failure");
2348 }
2349 }
2350
2351 int
2352 pthread_mutex_clocklock(pthread_mutex_t *restrict mp, clockid_t clock,
2353 const struct timespec *restrict abstime)
2354 {
2355 timespec_t tslocal;
2356 int error;
2357
2358 ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2359
2360 switch (clock) {
2361 case CLOCK_REALTIME:
2362 case CLOCK_HIGHRES:
2363 break;
2364 default:
2365 return (EINVAL);
2366 }
2367
2368 abstime_to_reltime(clock, abstime, &tslocal);
2369 error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2370 if (error == ETIME)
2371 error = ETIMEDOUT;
2372 return (error);
2373 }
2374
2375 int
2376 pthread_mutex_timedlock(pthread_mutex_t *restrict mp,
2377 const struct timespec *restrict abstime)
2378 {
2379 return (pthread_mutex_clocklock(mp, CLOCK_REALTIME, abstime));
2380 }
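/*
 * Illustrative sketch (not part of libc): a caller of
 * pthread_mutex_timedlock() supplies an absolute CLOCK_REALTIME
 * deadline and sees ETIMEDOUT (mapped from the internal ETIME) when it
 * expires; pthread_mutex_clocklock() is the same but lets the caller
 * name CLOCK_REALTIME or CLOCK_HIGHRES explicitly.
 * lock_with_deadline() is a hypothetical helper.
 *
 *     #include <pthread.h>
 *     #include <time.h>
 *
 *     static int
 *     lock_with_deadline(pthread_mutex_t *mp, int seconds)
 *     {
 *         struct timespec abstime;
 *
 *         (void) clock_gettime(CLOCK_REALTIME, &abstime);
 *         abstime.tv_sec += seconds;
 *         return (pthread_mutex_timedlock(mp, &abstime));
 *     }
 */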
2381
2382 int
2383 pthread_mutex_relclocklock_np(pthread_mutex_t *restrict mp, clockid_t clock,
2384 const struct timespec *restrict reltime)
2385 {
2386 timespec_t tslocal;
2387 int error;
2388
2389 ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2390
2391 switch (clock) {
2392 case CLOCK_REALTIME:
2393 case CLOCK_HIGHRES:
2394 break;
2395 default:
2396 return (EINVAL);
2397 }
2398
2399 tslocal = *reltime;
2400 error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2401 if (error == ETIME)
2402 error = ETIMEDOUT;
2403 return (error);
2404 }
2405
2406 int
2407 pthread_mutex_reltimedlock_np(pthread_mutex_t *restrict mp,
2408 const struct timespec *restrict reltime)
2409 {
2410 return (pthread_mutex_relclocklock_np(mp, CLOCK_REALTIME, reltime));
2411 }
2412
2413 #pragma weak pthread_mutex_trylock = mutex_trylock
2414 int
2415 mutex_trylock(mutex_t *mp)
2416 {
2417 ulwp_t *self = curthread;
2418 uberdata_t *udp = self->ul_uberdata;
2419 int mtype = mp->mutex_type;
2420 uberflags_t *gflags;
2421
2422 ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2423
2424 /*
2425 * Optimize the case of USYNC_THREAD, including
2426 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2427 * no error detection, no lock statistics,
2428 * and the process has only a single thread.
2429 * (Most likely a traditional single-threaded application.)
2430 */
2431 if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2432 udp->uberflags.uf_all) == 0) {
2433 /*
2434 * Only one thread exists so we don't need an atomic operation.
2435 * We do, however, need to protect against signals.
2436 */
2437 if (mp->mutex_lockw == 0) {
2438 sigoff(self);
2439 mp->mutex_lockw = LOCKSET;
2440 mp->mutex_owner = (uintptr_t)self;
2441 sigon(self);
2442 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2443 return (0);
2444 }
2445 if (mtype && MUTEX_OWNER(mp) == self)
2446 return (mutex_recursion(mp, mtype, MUTEX_TRY));
2447 return (EBUSY);
2448 }
2449
2450 /*
2451 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2452 * no error detection, and no lock statistics.
2453 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2454 */
2455 if ((gflags = self->ul_schedctl_called) != NULL &&
2456 (gflags->uf_trs_ted |
2457 (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2458 if (mtype & USYNC_PROCESS)
2459 return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
2460 sigoff(self);
2461 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2462 mp->mutex_owner = (uintptr_t)self;
2463 sigon(self);
2464 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2465 return (0);
2466 }
2467 sigon(self);
2468 if (mtype && MUTEX_OWNER(mp) == self)
2469 return (mutex_recursion(mp, mtype, MUTEX_TRY));
2470 if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2471 self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2472 tdb_event(TD_LOCK_TRY, udp);
2473 }
2474 return (EBUSY);
2475 }
2476
2477 /* else do it the long way */
2478 return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
2479 }
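/*
 * Illustrative sketch (not part of libc): the usual nonblocking pattern
 * built on pthread_mutex_trylock(), which returns 0 on success and
 * EBUSY when the lock cannot be acquired without blocking.  The demo
 * names are hypothetical.
 *
 *     #include <pthread.h>
 *
 *     static pthread_mutex_t try_demo_lock = PTHREAD_MUTEX_INITIALIZER;
 *     static int try_demo_count;
 *
 *     static void
 *     try_demo_bump(void)
 *     {
 *         if (pthread_mutex_trylock(&try_demo_lock) == 0) {
 *             try_demo_count++;
 *             (void) pthread_mutex_unlock(&try_demo_lock);
 *         }
 *     }
 */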
2480
2481 int
2482 mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
2483 {
2484 ulwp_t *self = curthread;
2485 uberdata_t *udp = self->ul_uberdata;
2486 int mtype = mp->mutex_type;
2487 tdb_mutex_stats_t *msp;
2488 int error = 0;
2489 int release_all;
2490 lwpid_t lwpid;
2491
2492 if ((mtype & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
2493 !mutex_held(mp))
2494 return (EPERM);
2495
2496 if (self->ul_error_detection && !mutex_held(mp))
2497 lock_error(mp, "mutex_unlock", NULL, NULL);
2498
2499 if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2500 mp->mutex_rcount--;
2501 DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2502 return (0);
2503 }
2504
2505 if ((msp = MUTEX_STATS(mp, udp)) != NULL)
2506 (void) record_hold_time(msp);
2507
2508 if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
2509 (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2510 ASSERT(mtype & LOCK_ROBUST);
2511 mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2512 mp->mutex_flag |= LOCK_NOTRECOVERABLE;
2513 }
2514 release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
2515
2516 if (mtype & LOCK_PRIO_INHERIT) {
2517 no_preempt(self);
2518 mp->mutex_owner = 0;
2519 /* mp->mutex_ownerpid is cleared by ___lwp_mutex_unlock() */
2520 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2521 mp->mutex_lockw = LOCKCLEAR;
2522 self->ul_pilocks--;
2523 error = ___lwp_mutex_unlock(mp);
2524 preempt(self);
2525 } else if (mtype & USYNC_PROCESS) {
2526 mutex_unlock_process(mp, release_all);
2527 } else { /* USYNC_THREAD */
2528 if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
2529 (void) __lwp_unpark(lwpid);
2530 preempt(self);
2531 }
2532 }
2533
2534 if (mtype & LOCK_ROBUST)
2535 forget_lock(mp);
2536
2537 if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
2538 _ceil_prio_waive();
2539
2540 return (error);
2541 }
2542
2543 #pragma weak pthread_mutex_unlock = mutex_unlock
2544 #pragma weak _mutex_unlock = mutex_unlock
2545 int
2546 mutex_unlock(mutex_t *mp)
2547 {
2548 ulwp_t *self = curthread;
2549 int mtype = mp->mutex_type;
2550 uberflags_t *gflags;
2551 lwpid_t lwpid;
2552 short el;
2553
2554 /*
2555 * Optimize the case of USYNC_THREAD, including
2556 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2557 * no error detection, no lock statistics,
2558 * and the process has only a single thread.
2559 * (Most likely a traditional single-threaded application.)
2560 */
2561 if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2562 self->ul_uberdata->uberflags.uf_all) == 0) {
2563 if (mtype) {
2564 /*
2565 * At this point we know that one or both of the
2566 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2567 */
2568 if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2569 return (EPERM);
2570 if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2571 mp->mutex_rcount--;
2572 DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2573 return (0);
2574 }
2575 }
2576 /*
2577 * Only one thread exists so we don't need an atomic operation.
2578 * Also, there can be no waiters.
2579 */
2580 sigoff(self);
2581 mp->mutex_owner = 0;
2582 mp->mutex_lockword = 0;
2583 sigon(self);
2584 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2585 return (0);
2586 }
2587
2588 /*
2589 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2590 * no error detection, and no lock statistics.
2591 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2592 */
2593 if ((gflags = self->ul_schedctl_called) != NULL) {
2594 if (((el = gflags->uf_trs_ted) | mtype) == 0) {
2595 fast_unlock:
2596 if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2597 (void) __lwp_unpark(lwpid);
2598 preempt(self);
2599 }
2600 return (0);
2601 }
2602 if (el) /* error detection or lock statistics */
2603 goto slow_unlock;
2604 if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2605 /*
2606 * At this point we know that one or both of the
2607 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2608 */
2609 if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2610 return (EPERM);
2611 if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2612 mp->mutex_rcount--;
2613 DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2614 return (0);
2615 }
2616 goto fast_unlock;
2617 }
2618 if ((mtype &
2619 ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2620 /*
2621 * At this point we know that zero, one, or both of the
2622 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
2623 * that the USYNC_PROCESS flag is set.
2624 */
2625 if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
2626 return (EPERM);
2627 if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2628 mp->mutex_rcount--;
2629 DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2630 return (0);
2631 }
2632 mutex_unlock_process(mp, 0);
2633 return (0);
2634 }
2635 }
2636
2637 /* else do it the long way */
2638 slow_unlock:
2639 return (mutex_unlock_internal(mp, 0));
2640 }
2641
2642 #pragma weak pthread_mutex_exit_np = mutex_exit
2643 void
2644 mutex_exit(mutex_t *mp)
2645 {
2646 int ret;
2647 int attr = mp->mutex_type & ALL_ATTRIBUTES;
2648
2649 if (attr != LOCK_ERRORCHECK &&
2650 attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) {
2651 mutex_panic(mp, "mutex_exit: bad mutex type");
2652 }
2653 ret = mutex_unlock(mp);
2654 if (ret == EPERM) {
2655 mutex_panic(mp, "mutex_exit: not owner");
2656 } else if (ret != 0) {
2657 mutex_panic(mp, "unknown mutex_exit failure");
2658 }
2659
2660 }
2661
2662 /*
2663 * Internally to the library, almost all mutex lock/unlock actions
2664 * go through these lmutex_ functions, to protect critical regions.
2665 * We replicate a bit of code from mutex_lock() and mutex_unlock()
2666 * to make these functions faster since we know that the mutex type
2667 * of all internal locks is USYNC_THREAD. We also know that internal
2668 * locking can never fail, so we panic if it does.
2669 */
2670 void
2671 lmutex_lock(mutex_t *mp)
2672 {
2673 ulwp_t *self = curthread;
2674 uberdata_t *udp = self->ul_uberdata;
2675
2676 ASSERT(mp->mutex_type == USYNC_THREAD);
2677
2678 enter_critical(self);
2679 /*
2680 * Optimize the case of no lock statistics and only a single thread.
2681 * (Most likely a traditional single-threaded application.)
2682 */
2683 if (udp->uberflags.uf_all == 0) {
2684 /*
2685 * Only one thread exists; the mutex must be free.
2686 */
2687 ASSERT(mp->mutex_lockw == 0);
2688 mp->mutex_lockw = LOCKSET;
2689 mp->mutex_owner = (uintptr_t)self;
2690 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2691 } else {
2692 tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2693
2694 if (!self->ul_schedctl_called)
2695 (void) setup_schedctl();
2696
2697 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2698 mp->mutex_owner = (uintptr_t)self;
2699 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2700 } else if (mutex_trylock_adaptive(mp, 1) != 0) {
2701 (void) mutex_lock_queue(self, msp, mp, NULL);
2702 }
2703
2704 if (msp)
2705 record_begin_hold(msp);
2706 }
2707 }
2708
2709 void
2710 lmutex_unlock(mutex_t *mp)
2711 {
2712 ulwp_t *self = curthread;
2713 uberdata_t *udp = self->ul_uberdata;
2714
2715 ASSERT(mp->mutex_type == USYNC_THREAD);
2716
2717 /*
2718 * Optimize the case of no lock statistics and only a single thread.
2719 * (Most likely a traditional single-threaded application.)
2720 */
2721 if (udp->uberflags.uf_all == 0) {
2722 /*
2723 * Only one thread exists so there can be no waiters.
2724 */
2725 mp->mutex_owner = 0;
2726 mp->mutex_lockword = 0;
2727 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2728 } else {
2729 tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2730 lwpid_t lwpid;
2731
2732 if (msp)
2733 (void) record_hold_time(msp);
2734 if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2735 (void) __lwp_unpark(lwpid);
2736 preempt(self);
2737 }
2738 }
2739 exit_critical(self);
2740 }
2741
2742 /*
2743 * For specialized code in libc, like the asynchronous i/o code,
2744 * the following sig_*() locking primitives are used in order
2745 * to make the code asynchronous signal safe. Signals are
2746 * deferred while locks acquired by these functions are held.
2747 */
2748 void
2749 sig_mutex_lock(mutex_t *mp)
2750 {
2751 ulwp_t *self = curthread;
2752
2753 sigoff(self);
2754 (void) mutex_lock(mp);
2755 }
2756
2757 void
2758 sig_mutex_unlock(mutex_t *mp)
2759 {
2760 ulwp_t *self = curthread;
2761
2762 (void) mutex_unlock(mp);
2763 sigon(self);
2764 }
2765
2766 int
2767 sig_mutex_trylock(mutex_t *mp)
2768 {
2769 ulwp_t *self = curthread;
2770 int error;
2771
2772 sigoff(self);
2773 if ((error = mutex_trylock(mp)) != 0)
2774 sigon(self);
2775 return (error);
2776 }
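/*
 * Illustrative sketch (not part of libc proper): how internal
 * async-signal-safe code such as the asynchronous i/o code is expected
 * to pair these primitives; signals remain deferred from
 * sig_mutex_lock() until the matching sig_mutex_unlock().  The demo
 * names are hypothetical.
 *
 *     static mutex_t sig_demo_lock = DEFAULTMUTEX;
 *     static int sig_demo_count;
 *
 *     static void
 *     sig_demo_update(void)
 *     {
 *         sig_mutex_lock(&sig_demo_lock);
 *         sig_demo_count++;
 *         sig_mutex_unlock(&sig_demo_lock);
 *     }
 */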
2777
2778 /*
2779 * sig_cond_wait() is a cancellation point.
2780 */
2781 int
2782 sig_cond_wait(cond_t *cv, mutex_t *mp)
2783 {
2784 int error;
2785
2786 ASSERT(curthread->ul_sigdefer != 0);
2787 pthread_testcancel();
2788 error = __cond_wait(cv, mp);
2789 if (error == EINTR && curthread->ul_cursig) {
2790 sig_mutex_unlock(mp);
2791 /* take the deferred signal here */
2792 sig_mutex_lock(mp);
2793 }
2794 pthread_testcancel();
2795 return (error);
2796 }
2797
2798 /*
2799 * sig_cond_reltimedwait() is a cancellation point.
2800 */
2801 int
2802 sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
2803 {
2804 int error;
2805
2806 ASSERT(curthread->ul_sigdefer != 0);
2807 pthread_testcancel();
2808 error = __cond_reltimedwait(cv, mp, ts);
2809 if (error == EINTR && curthread->ul_cursig) {
2810 sig_mutex_unlock(mp);
2811 /* take the deferred signal here */
2812 sig_mutex_lock(mp);
2813 }
2814 pthread_testcancel();
2815 return (error);
2816 }
2817
2818 /*
2819 * For specialized code in libc, like the stdio code,
2820 * the following cancel_safe_*() locking primitives are used in
2821 * order to make the code cancellation-safe. Cancellation is
2822 * deferred while locks acquired by these functions are held.
2823 */
2824 void
2825 cancel_safe_mutex_lock(mutex_t *mp)
2826 {
2827 (void) mutex_lock(mp);
2828 curthread->ul_libc_locks++;
2829 }
2830
2831 int
2832 cancel_safe_mutex_trylock(mutex_t *mp)
2833 {
2834 int error;
2835
2836 if ((error = mutex_trylock(mp)) == 0)
2837 curthread->ul_libc_locks++;
2838 return (error);
2839 }
2840
2841 void
2842 cancel_safe_mutex_unlock(mutex_t *mp)
2843 {
2844 ulwp_t *self = curthread;
2845
2846 ASSERT(self->ul_libc_locks != 0);
2847
2848 (void) mutex_unlock(mp);
2849
2850 /*
2851 * Decrement the count of locks held by cancel_safe_mutex_lock().
2852 * If we are then in a position to terminate cleanly, there is
2853 * a pending cancellation, cancellation is not disabled, and we
2854 * received EINTR from a recent system call, then perform the
2855 * cancellation action now.
2856 */
2857 if (--self->ul_libc_locks == 0 &&
2858 !(self->ul_vfork | self->ul_nocancel |
2859 self->ul_critical | self->ul_sigdefer) &&
2860 cancel_active())
2861 pthread_exit(PTHREAD_CANCELED);
2862 }
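/*
 * Illustrative sketch (not part of libc proper): the intended pairing
 * of the cancel_safe_*() primitives, in the style of stdio's internal
 * locking.  A pending cancellation is acted upon only when the
 * outermost cancel_safe_mutex_unlock() drops ul_libc_locks back to
 * zero.  The demo names are hypothetical.
 *
 *     static mutex_t stdio_demo_lock = DEFAULTMUTEX;
 *     static int stdio_demo_count;
 *
 *     static void
 *     stdio_demo_op(void)
 *     {
 *         cancel_safe_mutex_lock(&stdio_demo_lock);
 *         stdio_demo_count++;
 *         cancel_safe_mutex_unlock(&stdio_demo_lock);
 *     }
 */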
2863
2864 static int
2865 shared_mutex_held(mutex_t *mparg)
2866 {
2867 /*
2868 * The 'volatile' is necessary to make sure the compiler doesn't
2869 * reorder the tests of the various components of the mutex.
2870 * They must be tested in this order:
2871 * mutex_lockw
2872 * mutex_owner
2873 * mutex_ownerpid
2874 * This relies on the fact that everywhere mutex_lockw is cleared,
2875 * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
2876 * is cleared, and that everywhere mutex_lockw is set, mutex_owner
2877 * and mutex_ownerpid are set after mutex_lockw is set, and that
2878 * mutex_lockw is set or cleared with a memory barrier.
2879 */
2880 volatile mutex_t *mp = (volatile mutex_t *)mparg;
2881 ulwp_t *self = curthread;
2882 uberdata_t *udp = self->ul_uberdata;
2883
2884 return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
2885 }
2886
2887 #pragma weak _mutex_held = mutex_held
2888 int
2889 mutex_held(mutex_t *mparg)
2890 {
2891 volatile mutex_t *mp = (volatile mutex_t *)mparg;
2892
2893 if (mparg->mutex_type & USYNC_PROCESS)
2894 return (shared_mutex_held(mparg));
2895 return (MUTEX_OWNED(mp, curthread));
2896 }
2897
2898 #pragma weak pthread_mutex_destroy = mutex_destroy
2899 #pragma weak _mutex_destroy = mutex_destroy
2900 int
2901 mutex_destroy(mutex_t *mp)
2902 {
2903 if (mp->mutex_type & USYNC_PROCESS)
2904 forget_lock(mp);
2905 (void) memset(mp, 0, sizeof (*mp));
2906 tdb_sync_obj_deregister(mp);
2907 return (0);
2908 }
2909
2910 #pragma weak pthread_mutex_consistent_np = mutex_consistent
2911 #pragma weak pthread_mutex_consistent = mutex_consistent
2912 int
2913 mutex_consistent(mutex_t *mp)
2914 {
2915 /*
2916 * Do this only for an inconsistent, initialized robust lock
2917 * that we hold. For all other cases, return EINVAL.
2918 */
2919 if (mutex_held(mp) &&
2920 (mp->mutex_type & LOCK_ROBUST) &&
2921 (mp->mutex_flag & LOCK_INITED) &&
2922 (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2923 mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2924 mp->mutex_rcount = 0;
2925 return (0);
2926 }
2927 return (EINVAL);
2928 }
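/*
 * Illustrative sketch (not part of libc): the robust-mutex recovery
 * protocol that mutex_consistent() supports.  A locker that gets
 * EOWNERDEAD does own the mutex; it repairs the protected state and
 * marks the mutex consistent before unlocking.  Unlocking without
 * doing so makes the mutex permanently LOCK_NOTRECOVERABLE, and later
 * lockers get ENOTRECOVERABLE.  repair_shared_state() is hypothetical.
 *
 *     #include <pthread.h>
 *     #include <errno.h>
 *
 *     static int
 *     robust_lock_and_repair(pthread_mutex_t *mp)
 *     {
 *         int error = pthread_mutex_lock(mp);
 *
 *         if (error == EOWNERDEAD) {
 *             repair_shared_state();
 *             error = pthread_mutex_consistent(mp);
 *         }
 *         return (error);
 *     }
 */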
2929
2930 /*
2931 * Spin locks are separate from ordinary mutexes,
2932 * but we use the same data structure for them.
2933 */
2934
2935 int
2936 pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2937 {
2938 mutex_t *mp = (mutex_t *)lock;
2939
2940 (void) memset(mp, 0, sizeof (*mp));
2941 if (pshared == PTHREAD_PROCESS_SHARED)
2942 mp->mutex_type = USYNC_PROCESS;
2943 else
2944 mp->mutex_type = USYNC_THREAD;
2945 mp->mutex_flag = LOCK_INITED;
2946 mp->mutex_magic = MUTEX_MAGIC;
2947
2948 /*
2949 * This should be at the beginning of the function,
2950 * but for the sake of old broken applications that
2951 * do not have proper alignment for their mutexes
2952 * (and don't check the return code from pthread_spin_init),
2953 * we put it here, after initializing the mutex regardless.
2954 */
2955 if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2956 curthread->ul_misaligned == 0)
2957 return (EINVAL);
2958
2959 return (0);
2960 }
2961
2962 int
2963 pthread_spin_destroy(pthread_spinlock_t *lock)
2964 {
2965 (void) memset(lock, 0, sizeof (*lock));
2966 return (0);
2967 }
2968
2969 int
2970 pthread_spin_trylock(pthread_spinlock_t *lock)
2971 {
2972 mutex_t *mp = (mutex_t *)lock;
2973 ulwp_t *self = curthread;
2974 int error = 0;
2975
2976 no_preempt(self);
2977 if (set_lock_byte(&mp->mutex_lockw) != 0)
2978 error = EBUSY;
2979 else {
2980 mp->mutex_owner = (uintptr_t)self;
2981 if (mp->mutex_type == USYNC_PROCESS)
2982 mp->mutex_ownerpid = self->ul_uberdata->pid;
2983 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2984 }
2985 preempt(self);
2986 return (error);
2987 }
2988
2989 int
2990 pthread_spin_lock(pthread_spinlock_t *lock)
2991 {
2992 mutex_t *mp = (mutex_t *)lock;
2993 ulwp_t *self = curthread;
2994 volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
2995 int count = 0;
2996
2997 ASSERT(!self->ul_critical || self->ul_bindflags);
2998
2999 DTRACE_PROBE1(plockstat, mutex__spin, mp);
3000
3001 /*
3002 * We don't care whether the owner is running on a processor.
3003 * We just spin because that's what this interface requires.
3004 */
3005 for (;;) {
3006 if (*lockp == 0) { /* lock byte appears to be clear */
3007 no_preempt(self);
3008 if (set_lock_byte(lockp) == 0)
3009 break;
3010 preempt(self);
3011 }
3012 if (count < INT_MAX)
3013 count++;
3014 SMT_PAUSE();
3015 }
3016 mp->mutex_owner = (uintptr_t)self;
3017 if (mp->mutex_type == USYNC_PROCESS)
3018 mp->mutex_ownerpid = self->ul_uberdata->pid;
3019 preempt(self);
3020 if (count) {
3021 DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
3022 }
3023 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
3024 return (0);
3025 }
3026
3027 int
3028 pthread_spin_unlock(pthread_spinlock_t *lock)
3029 {
3030 mutex_t *mp = (mutex_t *)lock;
3031 ulwp_t *self = curthread;
3032
3033 no_preempt(self);
3034 mp->mutex_owner = 0;
3035 mp->mutex_ownerpid = 0;
3036 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
3037 (void) atomic_swap_32(&mp->mutex_lockword, 0);
3038 preempt(self);
3039 return (0);
3040 }
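/*
 * Illustrative sketch (not part of libc): typical use of the spin lock
 * interfaces above.  Because pthread_spin_lock() busy-waits (a
 * SMT_PAUSE() loop) instead of sleeping, it is suited only to very
 * short critical sections.  The demo names are hypothetical.
 *
 *     #include <pthread.h>
 *
 *     static pthread_spinlock_t spin_demo_lock;
 *     static int spin_demo_count;
 *
 *     static void
 *     spin_demo(void)
 *     {
 *         (void) pthread_spin_init(&spin_demo_lock,
 *             PTHREAD_PROCESS_PRIVATE);
 *         (void) pthread_spin_lock(&spin_demo_lock);
 *         spin_demo_count++;
 *         (void) pthread_spin_unlock(&spin_demo_lock);
 *         (void) pthread_spin_destroy(&spin_demo_lock);
 *     }
 */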
3041
3042 #define INITIAL_LOCKS 8 /* initial size of ul_heldlocks.array */
3043
3044 /*
3045 * Find/allocate an entry for 'lock' in our array of held locks.
3046 */
3047 static mutex_t **
3048 find_lock_entry(mutex_t *lock)
3049 {
3050 ulwp_t *self = curthread;
3051 mutex_t **remembered = NULL;
3052 mutex_t **lockptr;
3053 uint_t nlocks;
3054
3055 if ((nlocks = self->ul_heldlockcnt) != 0)
3056 lockptr = self->ul_heldlocks.array;
3057 else {
3058 nlocks = 1;
3059 lockptr = &self->ul_heldlocks.single;
3060 }
3061
3062 for (; nlocks; nlocks--, lockptr++) {
3063 if (*lockptr == lock)
3064 return (lockptr);
3065 if (*lockptr == NULL && remembered == NULL)
3066 remembered = lockptr;
3067 }
3068 if (remembered != NULL) {
3069 *remembered = lock;
3070 return (remembered);
3071 }
3072
3073 /*
3074 * No entry available. Allocate more space, converting
3075 * the single entry into an array of entries if necessary.
3076 */
3077 if ((nlocks = self->ul_heldlockcnt) == 0) {
3078 /*
3079 * Initial allocation of the array.
3080 * Convert the single entry into an array.
3081 */
3082 self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
3083 lockptr = lmalloc(nlocks * sizeof (mutex_t *));
3084 /*
3085 * The single entry becomes the first entry in the array.
3086 */
3087 *lockptr = self->ul_heldlocks.single;
3088 self->ul_heldlocks.array = lockptr;
3089 /*
3090 * Return the next available entry in the array.
3091 */
3092 *++lockptr = lock;
3093 return (lockptr);
3094 }
3095 /*
3096 * Reallocate the array, double the size each time.
3097 */
3098 lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
3099 (void) memcpy(lockptr, self->ul_heldlocks.array,
3100 nlocks * sizeof (mutex_t *));
3101 lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3102 self->ul_heldlocks.array = lockptr;
3103 self->ul_heldlockcnt *= 2;
3104 /*
3105 * Return the next available entry in the newly allocated array.
3106 */
3107 *(lockptr += nlocks) = lock;
3108 return (lockptr);
3109 }
3110
3111 /*
3112 * Insert 'lock' into our list of held locks.
3113 * Currently only used for LOCK_ROBUST mutexes.
3114 */
3115 void
3116 remember_lock(mutex_t *lock)
3117 {
3118 (void) find_lock_entry(lock);
3119 }
3120
3121 /*
3122 * Remove 'lock' from our list of held locks.
3123 * Currently only used for LOCK_ROBUST mutexes.
3124 */
3125 void
3126 forget_lock(mutex_t *lock)
3127 {
3128 *find_lock_entry(lock) = NULL;
3129 }
3130
3131 /*
3132 * Free the array of held locks.
3133 */
3134 void
3135 heldlock_free(ulwp_t *ulwp)
3136 {
3137 uint_t nlocks;
3138
3139 if ((nlocks = ulwp->ul_heldlockcnt) != 0)
3140 lfree(ulwp->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3141 ulwp->ul_heldlockcnt = 0;
3142 ulwp->ul_heldlocks.array = NULL;
3143 }
3144
3145 /*
3146 * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
3147 * Called from _thrp_exit() to deal with abandoned locks.
3148 */
3149 void
3150 heldlock_exit(void)
3151 {
3152 ulwp_t *self = curthread;
3153 mutex_t **lockptr;
3154 uint_t nlocks;
3155 mutex_t *mp;
3156
3157 if ((nlocks = self->ul_heldlockcnt) != 0)
3158 lockptr = self->ul_heldlocks.array;
3159 else {
3160 nlocks = 1;
3161 lockptr = &self->ul_heldlocks.single;
3162 }
3163
3164 for (; nlocks; nlocks--, lockptr++) {
3165 /*
3166 * The kernel takes care of transitioning held
3167 * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
3168 * We avoid that case here.
3169 */
3170 if ((mp = *lockptr) != NULL &&
3171 mutex_held(mp) &&
3172 (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
3173 LOCK_ROBUST) {
3174 mp->mutex_rcount = 0;
3175 if (!(mp->mutex_flag & LOCK_UNMAPPED))
3176 mp->mutex_flag |= LOCK_OWNERDEAD;
3177 (void) mutex_unlock_internal(mp, 1);
3178 }
3179 }
3180
3181 heldlock_free(self);
3182 }
3183
3184 #pragma weak _cond_init = cond_init
3185 int
3186 cond_init(cond_t *cvp, int type, void *arg __unused)
3187 {
3188 if (type != USYNC_THREAD && type != USYNC_PROCESS)
3189 return (EINVAL);
3190
3191 /*
3192 * This memset initializes cond_clock to CLOCK_REALTIME.
3193 */
3194 (void) memset(cvp, 0, sizeof (*cvp));
3195 cvp->cond_type = (uint16_t)type;
3196 cvp->cond_magic = COND_MAGIC;
3197
3198 /*
3199 * This should be at the beginning of the function,
3200 * but for the sake of old broken applications that
3201 * do not have proper alignment for their condvars
3202 * (and don't check the return code from cond_init),
3203 * we put it here, after initializing the condvar regardless.
3204 */
3205 if (((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1)) &&
3206 curthread->ul_misaligned == 0)
3207 return (EINVAL);
3208
3209 return (0);
3210 }
3211
3212 /*
3213 * cond_sleep_queue(): utility function for cond_wait_queue().
3214 *
3215 * Go to sleep on a condvar sleep queue, expect to be waked up
3216 * by someone calling cond_signal() or cond_broadcast() or due
3217 * to receiving a UNIX signal or being cancelled, or just simply
3218 * due to a spurious wakeup (like someone calling forkall()).
3219 *
3220 * The associated mutex is *not* reacquired before returning.
3221 * That must be done by the caller of cond_sleep_queue().
3222 */
3223 static int
3224 cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3225 {
3226 ulwp_t *self = curthread;
3227 queue_head_t *qp;
3228 queue_head_t *mqp;
3229 lwpid_t lwpid;
3230 int signalled;
3231 int error;
3232 int cv_wake;
3233 int release_all;
3234
3235 /*
3236 * Put ourself on the CV sleep queue, unlock the mutex, then
3237 * park ourself and unpark a candidate lwp to grab the mutex.
3238 * We must go onto the CV sleep queue before dropping the
3239 * mutex in order to guarantee atomicity of the operation.
3240 */
3241 self->ul_sp = stkptr();
3242 qp = queue_lock(cvp, CV);
3243 enqueue(qp, self, 0);
3244 cvp->cond_waiters_user = 1;
3245 self->ul_cvmutex = mp;
3246 self->ul_cv_wake = cv_wake = (tsp != NULL);
3247 self->ul_signalled = 0;
3248 if (mp->mutex_flag & LOCK_OWNERDEAD) {
3249 mp->mutex_flag &= ~LOCK_OWNERDEAD;
3250 mp->mutex_flag |= LOCK_NOTRECOVERABLE;
3251 }
3252 release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
3253 lwpid = mutex_unlock_queue(mp, release_all);
3254 for (;;) {
3255 set_parking_flag(self, 1);
3256 queue_unlock(qp);
3257 if (lwpid != 0) {
3258 lwpid = preempt_unpark(self, lwpid);
3259 preempt(self);
3260 }
3261 /*
3262 * We may have a deferred signal present,
3263 * in which case we should return EINTR.
3264 * Also, we may have received a SIGCANCEL; if so
3265 * and we are cancelable we should return EINTR.
3266 * We force an immediate EINTR return from
3267 * __lwp_park() by turning our parking flag off.
3268 */
3269 if (self->ul_cursig != 0 ||
3270 (self->ul_cancelable && self->ul_cancel_pending))
3271 set_parking_flag(self, 0);
3272 /*
3273 * __lwp_park() will return the residual time in tsp
3274 * if we are unparked before the timeout expires.
3275 */
3276 error = __lwp_park(tsp, lwpid);
3277 set_parking_flag(self, 0);
3278 lwpid = 0; /* unpark the other lwp only once */
3279 /*
3280 * We were waked up by cond_signal(), cond_broadcast(),
3281 * by an interrupt or timeout (EINTR or ETIME),
3282 * or we may just have gotten a spurious wakeup.
3283 */
3284 qp = queue_lock(cvp, CV);
3285 if (!cv_wake)
3286 mqp = queue_lock(mp, MX);
3287 if (self->ul_sleepq == NULL)
3288 break;
3289 /*
3290 * We are on either the condvar sleep queue or the
3291 * mutex sleep queue. Break out of the sleep if we
3292 * were interrupted or we timed out (EINTR or ETIME).
3293 * Else this is a spurious wakeup; continue the loop.
3294 */
3295 if (!cv_wake && self->ul_sleepq == mqp) { /* mutex queue */
3296 if (error) {
3297 mp->mutex_waiters = dequeue_self(mqp);
3298 break;
3299 }
3300 tsp = NULL; /* no more timeout */
3301 } else if (self->ul_sleepq == qp) { /* condvar queue */
3302 if (error) {
3303 cvp->cond_waiters_user = dequeue_self(qp);
3304 break;
3305 }
3306 /*
3307 * Else a spurious wakeup on the condvar queue.
3308 * __lwp_park() has already adjusted the timeout.
3309 */
3310 } else {
3311 thr_panic("cond_sleep_queue(): thread not on queue");
3312 }
3313 if (!cv_wake)
3314 queue_unlock(mqp);
3315 }
3316
3317 self->ul_sp = 0;
3318 self->ul_cv_wake = 0;
3319 ASSERT(self->ul_cvmutex == NULL);
3320 ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
3321 self->ul_wchan == NULL);
3322
3323 signalled = self->ul_signalled;
3324 self->ul_signalled = 0;
3325 queue_unlock(qp);
3326 if (!cv_wake)
3327 queue_unlock(mqp);
3328
3329 /*
3330 * If we were concurrently cond_signal()d and any of:
3331 * received a UNIX signal, were cancelled, or got a timeout,
3332 * then perform another cond_signal() to avoid consuming it.
3333 */
3334 if (error && signalled)
3335 (void) cond_signal(cvp);
3336
3337 return (error);
3338 }
3339
3340 static void
3341 cond_wait_check_alignment(cond_t *cvp, mutex_t *mp)
3342 {
3343 if ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1))
3344 lock_error(mp, "cond_wait", cvp, "mutex is misaligned");
3345 if ((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1))
3346 lock_error(mp, "cond_wait", cvp, "condvar is misaligned");
3347 }
3348
3349 int
3350 cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3351 {
3352 ulwp_t *self = curthread;
3353 int error;
3354 int merror;
3355
3356 if (self->ul_error_detection && self->ul_misaligned == 0)
3357 cond_wait_check_alignment(cvp, mp);
3358
3359 /*
3360 * The old thread library was programmed to defer signals
3361 * while in cond_wait() so that the associated mutex would
3362 * be guaranteed to be held when the application signal
3363 * handler was invoked.
3364 *
3365 * We do not behave this way by default; the state of the
3366 * associated mutex in the signal handler is undefined.
3367 *
3368 * To accommodate applications that depend on the old
3369 * behavior, the _THREAD_COND_WAIT_DEFER environment
3370 * variable can be set to 1 and we will behave in the
3371 * old way with respect to cond_wait().
3372 */
3373 if (self->ul_cond_wait_defer)
3374 sigoff(self);
3375
3376 error = cond_sleep_queue(cvp, mp, tsp);
3377
3378 /*
3379 * Reacquire the mutex.
3380 */
3381 if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3382 error = merror;
3383
3384 /*
3385 * Take any deferred signal now, after we have reacquired the mutex.
3386 */
3387 if (self->ul_cond_wait_defer)
3388 sigon(self);
3389
3390 return (error);
3391 }
3392
3393 /*
3394 * cond_sleep_kernel(): utility function for cond_wait_kernel().
3395 * See the comment ahead of cond_sleep_queue(), above.
3396 */
3397 static int
3398 cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3399 {
3400 int mtype = mp->mutex_type;
3401 ulwp_t *self = curthread;
3402 int error;
3403
3404 if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
3405 _ceil_prio_waive();
3406
3407 self->ul_sp = stkptr();
3408 self->ul_wchan = cvp;
3409 sigoff(self);
3410 mp->mutex_owner = 0;
3411 /* mp->mutex_ownerpid is cleared by ___lwp_cond_wait() */
3412 if (mtype & LOCK_PRIO_INHERIT) {
3413 mp->mutex_lockw = LOCKCLEAR;
3414 self->ul_pilocks--;
3415 }
3416 /*
3417 * ___lwp_cond_wait() returns immediately with EINTR if
3418 * set_parking_flag(self,0) is called on this lwp before it
3419 * goes to sleep in the kernel. sigacthandler() calls this
3420 * when a deferred signal is noted. This assures that we don't
3421 * get stuck in ___lwp_cond_wait() with all signals blocked
3422 * due to taking a deferred signal before going to sleep.
3423 */
3424 set_parking_flag(self, 1);
3425 if (self->ul_cursig != 0 ||
3426 (self->ul_cancelable && self->ul_cancel_pending))
3427 set_parking_flag(self, 0);
3428 error = ___lwp_cond_wait(cvp, mp, tsp, 1);
3429 set_parking_flag(self, 0);
3430 sigon(self);
3431 self->ul_sp = 0;
3432 self->ul_wchan = NULL;
3433 return (error);
3434 }
3435
3436 int
3437 cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3438 {
3439 ulwp_t *self = curthread;
3440 int error;
3441 int merror;
3442
3443 if (self->ul_error_detection && self->ul_misaligned == 0)
3444 cond_wait_check_alignment(cvp, mp);
3445
3446 /*
3447 * See the large comment in cond_wait_queue(), above.
3448 */
3449 if (self->ul_cond_wait_defer)
3450 sigoff(self);
3451
3452 error = cond_sleep_kernel(cvp, mp, tsp);
3453
3454 /*
3455 * Override the return code from ___lwp_cond_wait()
3456 * with any non-zero return code from mutex_lock().
3457 * This addresses robust lock failures in particular;
3458 * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
3459 * errors in order to take corrective action.
3460 */
3461 if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3462 error = merror;
3463
3464 /*
3465 * Take any deferred signal now, after we have reacquired the mutex.
3466 */
3467 if (self->ul_cond_wait_defer)
3468 sigon(self);
3469
3470 return (error);
3471 }
3472
3473 /*
3474 * Common code for cond_wait() and cond_timedwait()
3475 */
3476 int
3477 cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3478 {
3479 int mtype = mp->mutex_type;
3480 hrtime_t begin_sleep = 0;
3481 ulwp_t *self = curthread;
3482 uberdata_t *udp = self->ul_uberdata;
3483 tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3484 tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
3485 uint8_t rcount;
3486 int error = 0;
3487
3488 /*
3489 * The SUSV3 Posix spec for pthread_cond_timedwait() states:
3490 * Except in the case of [ETIMEDOUT], all these error checks
3491 * shall act as if they were performed immediately at the
3492 * beginning of processing for the function and shall cause
3493 * an error return, in effect, prior to modifying the state
3494 * of the mutex specified by mutex or the condition variable
3495 * specified by cond.
3496 * Therefore, we must return EINVAL now if the timeout is invalid.
3497 */
3498 if (tsp != NULL &&
3499 (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
3500 return (EINVAL);
3501
3502 if (__td_event_report(self, TD_SLEEP, udp)) {
3503 self->ul_sp = stkptr();
3504 self->ul_wchan = cvp;
3505 self->ul_td_evbuf.eventnum = TD_SLEEP;
3506 self->ul_td_evbuf.eventdata = cvp;
3507 tdb_event(TD_SLEEP, udp);
3508 self->ul_sp = 0;
3509 }
3510 if (csp) {
3511 if (tsp)
3512 tdb_incr(csp->cond_timedwait);
3513 else
3514 tdb_incr(csp->cond_wait);
3515 }
3516 if (msp)
3517 begin_sleep = record_hold_time(msp);
3518 else if (csp)
3519 begin_sleep = gethrtime();
3520
3521 if (self->ul_error_detection) {
3522 if (!mutex_held(mp))
3523 lock_error(mp, "cond_wait", cvp, NULL);
3524 if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
3525 lock_error(mp, "recursive mutex in cond_wait",
3526 cvp, NULL);
3527 if (cvp->cond_type & USYNC_PROCESS) {
3528 if (!(mtype & USYNC_PROCESS))
3529 lock_error(mp, "cond_wait", cvp,
3530 "condvar process-shared, "
3531 "mutex process-private");
3532 } else {
3533 if (mtype & USYNC_PROCESS)
3534 lock_error(mp, "cond_wait", cvp,
3535 "condvar process-private, "
3536 "mutex process-shared");
3537 }
3538 }
3539
3540 /*
3541 * We deal with recursive mutexes by completely
3542 * dropping the lock and restoring the recursion
3543 * count after waking up. This is arguably wrong,
3544 * but it obeys the principle of least astonishment.
3545 */
3546 rcount = mp->mutex_rcount;
3547 mp->mutex_rcount = 0;
3548 if ((mtype &
3549 (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
3550 (cvp->cond_type & USYNC_PROCESS))
3551 error = cond_wait_kernel(cvp, mp, tsp);
3552 else
3553 error = cond_wait_queue(cvp, mp, tsp);
3554 mp->mutex_rcount = rcount;
3555
3556 if (csp) {
3557 hrtime_t lapse = gethrtime() - begin_sleep;
3558 if (tsp == NULL)
3559 csp->cond_wait_sleep_time += lapse;
3560 else {
3561 csp->cond_timedwait_sleep_time += lapse;
3562 if (error == ETIME)
3563 tdb_incr(csp->cond_timedwait_timeout);
3564 }
3565 }
3566 return (error);
3567 }
3568
3569 /*
3570 * cond_wait() is a cancellation point but __cond_wait() is not.
3571 * Internally, libc calls the non-cancellation version.
3572 * Other libraries need to use pthread_setcancelstate(), as appropriate,
3573 * since __cond_wait() is not exported from libc.
3574 */
3575 int
3576 __cond_wait(cond_t *cvp, mutex_t *mp)
3577 {
3578 ulwp_t *self = curthread;
3579 uberdata_t *udp = self->ul_uberdata;
3580 uberflags_t *gflags;
3581
3582 if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3583 !mutex_held(mp))
3584 return (EPERM);
3585
3586 /*
3587 * Optimize the common case of USYNC_THREAD plus
3588 * no error detection, no lock statistics, and no event tracing.
3589 */
3590 if ((gflags = self->ul_schedctl_called) != NULL &&
3591 (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
3592 self->ul_td_events_enable |
3593 udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
3594 return (cond_wait_queue(cvp, mp, NULL));
3595
3596 /*
3597 * Else do it the long way.
3598 */
3599 return (cond_wait_common(cvp, mp, NULL));
3600 }
3601
3602 #pragma weak _cond_wait = cond_wait
3603 int
3604 cond_wait(cond_t *cvp, mutex_t *mp)
3605 {
3606 int error;
3607
3608 _cancelon();
3609 error = __cond_wait(cvp, mp);
3610 if (error == EINTR)
3611 _canceloff();
3612 else
3613 _canceloff_nocancel();
3614 return (error);
3615 }
3616
3617 /*
3618 * pthread_cond_wait() is a cancellation point.
3619 */
3620 int
3621 pthread_cond_wait(pthread_cond_t *restrict cvp, pthread_mutex_t *restrict mp)
3622 {
3623 int error;
3624
3625 error = cond_wait((cond_t *)cvp, (mutex_t *)mp);
3626 return ((error == EINTR)? 0 : error);
3627 }
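/*
 * Illustrative sketch (not part of libc): the canonical condition
 * variable pattern implied by the code above.  Spurious wakeups are
 * possible and EINTR is absorbed by pthread_cond_wait(), so the
 * predicate must be re-checked in a loop while the mutex is held.
 * The demo names are hypothetical.
 *
 *     #include <pthread.h>
 *
 *     static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
 *     static pthread_cond_t demo_cv = PTHREAD_COND_INITIALIZER;
 *     static int demo_ready;
 *
 *     static void
 *     demo_wait(void)
 *     {
 *         (void) pthread_mutex_lock(&demo_lock);
 *         while (!demo_ready)
 *             (void) pthread_cond_wait(&demo_cv, &demo_lock);
 *         (void) pthread_mutex_unlock(&demo_lock);
 *     }
 *
 *     static void
 *     demo_post(void)
 *     {
 *         (void) pthread_mutex_lock(&demo_lock);
 *         demo_ready = 1;
 *         (void) pthread_cond_signal(&demo_cv);
 *         (void) pthread_mutex_unlock(&demo_lock);
 *     }
 */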
3628
3629 /*
3630 * cond_timedwait() is a cancellation point but __cond_timedwait() is not.
3631 */
3632 int
3633 __cond_timedwait(cond_t *cvp, mutex_t *mp, clockid_t clock_id,
3634 const timespec_t *abstime)
3635 {
3636 timespec_t reltime;
3637 int error;
3638
3639 if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3640 !mutex_held(mp))
3641 return (EPERM);
3642
3643 if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
3644 clock_id = CLOCK_REALTIME;
3645 abstime_to_reltime(clock_id, abstime, &reltime);
3646 error = cond_wait_common(cvp, mp, &reltime);
3647 if (error == ETIME && clock_id == CLOCK_HIGHRES) {
3648 /*
3649 * Don't return ETIME if we didn't really get a timeout.
3650 * This can happen if we return because someone resets
3651 * the system clock. Just return zero in this case,
3652 * giving a spurious wakeup but not a timeout.
3653 */
3654 if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
3655 abstime->tv_nsec > gethrtime())
3656 error = 0;
3657 }
3658 return (error);
3659 }
3660
3661 static int
3662 cond_clockwait(cond_t *cvp, mutex_t *mp, clockid_t clock,
3663 const timespec_t *abstime)
3664 {
3665 int error;
3666
3667 _cancelon();
3668 error = __cond_timedwait(cvp, mp, clock, abstime);
3669 if (error == EINTR)
3670 _canceloff();
3671 else
3672 _canceloff_nocancel();
3673 return (error);
3674 }
3675
3676 /*
3677 * This function, internal to libc, determines the clockid to use for a
3678 * cond_t. The cond_t (and its pthreads / C equivalent) encodes a clock id
3679 * that should be used as a timing source. With the static initializers,
3680 * which set this field to zero, cond_clockid ends up as __CLOCK_REALTIME0,
3681 * which isn't really used in the system any more. Consumers of the clockid
3682 * call this function to translate it. Note that we fail open: if someone
3683 * has corrupted the clockid, we fall back to a well-known clock to preserve
3684 * the traditional system behavior.
3685 */
3686 static clockid_t
3687 cond_clock(cond_t *cvp)
3688 {
3689 if (cvp->cond_clockid != CLOCK_REALTIME &&
3690 cvp->cond_clockid != CLOCK_MONOTONIC) {
3691 return (CLOCK_REALTIME);
3692 }
3693
3694 return (cvp->cond_clockid);
3695 }
3696
3697 int
3698 cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3699 {
3700 return (cond_clockwait(cvp, mp, cond_clock(cvp), abstime));
3701 }
3702
3703 /*
3704 * pthread_cond_timedwait() and pthread_cond_clockwait() are cancellation
3705 * points. We need to check for cancellation before we evaluate whether the
3706 * clock is valid.
3707 */
3708 int
3709 pthread_cond_clockwait(pthread_cond_t *restrict cvp,
3710 pthread_mutex_t *restrict mp, clockid_t clock,
3711 const struct timespec *restrict abstime)
3712 {
3713 int error;
3714
3715 switch (clock) {
3716 case CLOCK_REALTIME:
3717 case CLOCK_HIGHRES:
3718 break;
3719 default:
3720 return (EINVAL);
3721 }
3722
3723 /* We need to translate between the native threads errors and POSIX */
3724 error = cond_clockwait((cond_t *)cvp, (mutex_t *)mp, clock, abstime);
3725 if (error == ETIME)
3726 error = ETIMEDOUT;
3727 else if (error == EINTR)
3728 error = 0;
3729 return (error);
3730 }
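/*
 * Illustrative sketch (not part of libc): waiting with an absolute
 * deadline on an explicitly named clock.  The sketch assumes
 * clock_gettime() accepts CLOCK_HIGHRES (the only clock this function
 * honors besides CLOCK_REALTIME); the demo_* names continue the sketch
 * after pthread_cond_wait() above.
 *
 *     #include <pthread.h>
 *     #include <time.h>
 *     #include <errno.h>
 *
 *     static int
 *     demo_wait_deadline(int seconds)
 *     {
 *         struct timespec abstime;
 *         int error = 0;
 *
 *         (void) clock_gettime(CLOCK_HIGHRES, &abstime);
 *         abstime.tv_sec += seconds;
 *
 *         (void) pthread_mutex_lock(&demo_lock);
 *         while (!demo_ready) {
 *             error = pthread_cond_clockwait(&demo_cv, &demo_lock,
 *                 CLOCK_HIGHRES, &abstime);
 *             if (error == ETIMEDOUT)
 *                 break;
 *         }
 *         (void) pthread_mutex_unlock(&demo_lock);
 *         return (error);
 *     }
 */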
3731
3732 int
3733 pthread_cond_timedwait(pthread_cond_t *restrict cvp,
3734 pthread_mutex_t *restrict mp, const struct timespec *restrict abstime)
3735 {
3736 cond_t *cond = (cond_t *)cvp;
3737 return (pthread_cond_clockwait(cvp, mp, cond_clock(cond), abstime));
3738 }
3739
3740 /*
3741 * cond_reltimedwait() is a cancellation point but __cond_reltimedwait() is not.
3742 *
3743 * Note, this function does not actually consume the clock id. Internally all
3744 * waits are based upon the highres clock in the system and therefore the actual
3745 * clock used is ignored at this point.
3746 */
3747 int
__cond_reltimedwait(cond_t * cvp,mutex_t * mp,const timespec_t * reltime)3748 __cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3749 {
3750 timespec_t tslocal = *reltime;
3751
3752 if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3753 !mutex_held(mp))
3754 return (EPERM);
3755
3756 return (cond_wait_common(cvp, mp, &tslocal));
3757 }
3758
int
cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
{
	int error;

	_cancelon();
	error = __cond_reltimedwait(cvp, mp, reltime);
	if (error == EINTR)
		_canceloff();
	else
		_canceloff_nocancel();
	return (error);
}

int
pthread_cond_relclockwait_np(pthread_cond_t *restrict cvp,
    pthread_mutex_t *restrict mp, clockid_t clock,
    const struct timespec *restrict reltime)
{
	int error;

	switch (clock) {
	case CLOCK_REALTIME:
	case CLOCK_HIGHRES:
		break;
	default:
		return (EINVAL);
	}

	error = cond_reltimedwait((cond_t *)cvp, (mutex_t *)mp, reltime);
	if (error == ETIME)
		error = ETIMEDOUT;
	else if (error == EINTR)
		error = 0;
	return (error);
}

int
pthread_cond_reltimedwait_np(pthread_cond_t *restrict cvp,
    pthread_mutex_t *restrict mp, const struct timespec *restrict reltime)
{
	cond_t *cond = (cond_t *)cvp;
	return (pthread_cond_relclockwait_np(cvp, mp, cond_clock(cond),
	    reltime));
}

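/*
 * Illustration (editorial sketch, not part of libc): the relative-wait
 * variant takes a timeout measured from now rather than an absolute
 * deadline. A hedged sketch with hypothetical "cv" and "lock", waiting at
 * most 500 milliseconds; real code would re-check its predicate around
 * the wait just as in the absolute-time example above:
 *
 *	timespec_t reltime = { 0, 500000000 };
 *
 *	(void) pthread_mutex_lock(&lock);
 *	(void) pthread_cond_relclockwait_np(&cv, &lock, CLOCK_MONOTONIC,
 *	    &reltime);
 *	(void) pthread_mutex_unlock(&lock);
 */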
#pragma weak pthread_cond_signal = cond_signal
#pragma weak _cond_signal = cond_signal
int
cond_signal(cond_t *cvp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
	int error = 0;
	int more;
	lwpid_t lwpid;
	queue_head_t *qp;
	mutex_t *mp;
	queue_head_t *mqp;
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev;

	if (csp)
		tdb_incr(csp->cond_signal);

	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
		error = _lwp_cond_signal(cvp);

	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
		return (error);

	/*
	 * Move some thread from the condvar sleep queue to the mutex sleep
	 * queue for the mutex that it will acquire on being woken up.
	 * We can do this only if we own the mutex it will acquire.
	 * If we do not own the mutex, or if its ul_cv_wake flag
	 * is set, just dequeue and unpark it.
	 */
	qp = queue_lock(cvp, CV);
	ulwpp = queue_slot(qp, &prev, &more);
	cvp->cond_waiters_user = more;
	if (ulwpp == NULL) {	/* no one on the sleep queue */
		queue_unlock(qp);
		return (error);
	}
	ulwp = *ulwpp;

	/*
	 * Inform the thread that it was the recipient of a cond_signal().
	 * This lets it deal with cond_signal() and, concurrently,
	 * one or more of a cancellation, a UNIX signal, or a timeout.
	 * These latter conditions must not consume a cond_signal().
	 */
	ulwp->ul_signalled = 1;

	/*
	 * Dequeue the waiter but leave its ul_sleepq non-NULL
	 * while we move it to the mutex queue so that it can
	 * deal properly with spurious wakeups.
	 */
	queue_unlink(qp, ulwpp, prev);

	mp = ulwp->ul_cvmutex;		/* the mutex it will acquire */
	ulwp->ul_cvmutex = NULL;
	ASSERT(mp != NULL);

	if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
		/* just wake it up */
		lwpid = ulwp->ul_lwpid;
		no_preempt(self);
		ulwp->ul_sleepq = NULL;
		ulwp->ul_wchan = NULL;
		queue_unlock(qp);
		(void) __lwp_unpark(lwpid);
		preempt(self);
	} else {
		/* move it to the mutex queue */
		mqp = queue_lock(mp, MX);
		enqueue(mqp, ulwp, 0);
		mp->mutex_waiters = 1;
		queue_unlock(mqp);
		queue_unlock(qp);
	}

	return (error);
}

/*
 * Utility function called by mutex_wakeup_all(), cond_broadcast(),
 * and rw_queue_release() to (re)allocate a big buffer to hold the
 * lwpids of all the threads to be set running after they are removed
 * from their sleep queues. Since we are holding a queue lock, we
 * cannot call any function that might acquire a lock. mmap(), munmap(),
 * lwp_unpark_all() are simple system calls and are safe in this regard.
 */
lwpid_t *
alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
{
	/*
	 * Allocate NEWLWPS ids on the first overflow.
	 * Double the allocation each time after that.
	 */
	int nlwpid = *nlwpid_ptr;
	int maxlwps = *maxlwps_ptr;
	int first_allocation;
	int newlwps;
	void *vaddr;

	ASSERT(nlwpid == maxlwps);

	first_allocation = (maxlwps == MAXLWPS);
	newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
	vaddr = mmap(NULL, newlwps * sizeof (lwpid_t),
	    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);

	if (vaddr == MAP_FAILED) {
		/*
		 * Let's hope this never happens.
		 * If it does, then we have a terrible
		 * thundering herd on our hands.
		 */
		(void) __lwp_unpark_all(lwpid, nlwpid);
		*nlwpid_ptr = 0;
	} else {
		(void) memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
		if (!first_allocation)
			(void) munmap((caddr_t)lwpid,
			    maxlwps * sizeof (lwpid_t));
		lwpid = vaddr;
		*maxlwps_ptr = newlwps;
	}

	return (lwpid);
}

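/*
 * Illustration (editorial sketch, not part of libc): the caller pattern
 * that alloc_lwpids() is designed for, as used by cond_broadcast() below
 * and by mutex_wakeup_all(). The caller starts with an on-stack buffer,
 * grows it only on overflow while still holding the queue lock, and frees
 * any mmap()ed buffer after the unpark:
 *
 *	lwpid_t buffer[MAXLWPS];
 *	lwpid_t *lwpid = buffer;
 *	int nlwpid = 0;
 *	int maxlwps = MAXLWPS;
 *
 *	(while holding the queue lock, for each thread to be woken:)
 *	if (nlwpid == maxlwps)
 *		lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
 *	lwpid[nlwpid++] = ulwp->ul_lwpid;
 *
 *	(after dropping the queue lock:)
 *	if (nlwpid != 0)
 *		(void) __lwp_unpark_all(lwpid, nlwpid);
 *	if (lwpid != buffer)
 *		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
 */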
#pragma weak pthread_cond_broadcast = cond_broadcast
#pragma weak _cond_broadcast = cond_broadcast
int
cond_broadcast(cond_t *cvp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
	int error = 0;
	queue_head_t *qp;
	queue_root_t *qrp;
	mutex_t *mp;
	mutex_t *mp_cache = NULL;
	queue_head_t *mqp = NULL;
	ulwp_t *ulwp;
	int nlwpid = 0;
	int maxlwps = MAXLWPS;
	lwpid_t buffer[MAXLWPS];
	lwpid_t *lwpid = buffer;

	if (csp)
		tdb_incr(csp->cond_broadcast);

	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
		error = _lwp_cond_broadcast(cvp);

	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
		return (error);

	/*
	 * Move everyone from the condvar sleep queue to the mutex sleep
	 * queue for the mutex that they will acquire on being woken up.
	 * We can do this only if we own the mutex they will acquire.
	 * If we do not own the mutex, or if their ul_cv_wake flag
	 * is set, just dequeue and unpark them.
	 *
	 * We keep track of lwpids that are to be unparked in lwpid[].
	 * __lwp_unpark_all() is called to unpark all of them after
	 * they have been removed from the sleep queue and the sleep
	 * queue lock has been dropped. If we run out of space in our
	 * on-stack buffer, we need to allocate more but we can't call
	 * lmalloc() because we are holding a queue lock when the overflow
	 * occurs and lmalloc() acquires a lock. We can't use alloca()
	 * either because the application may have allocated a small
	 * stack and we don't want to overrun the stack. So we call
	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
	 * system call directly since that path acquires no locks.
	 */
	qp = queue_lock(cvp, CV);
	cvp->cond_waiters_user = 0;
	for (;;) {
		if ((qrp = qp->qh_root) == NULL ||
		    (ulwp = qrp->qr_head) == NULL)
			break;
		ASSERT(ulwp->ul_wchan == cvp);
		queue_unlink(qp, &qrp->qr_head, NULL);
		mp = ulwp->ul_cvmutex;		/* its mutex */
		ulwp->ul_cvmutex = NULL;
		ASSERT(mp != NULL);
		if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
			/* just wake it up */
			ulwp->ul_sleepq = NULL;
			ulwp->ul_wchan = NULL;
			if (nlwpid == maxlwps)
				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
			lwpid[nlwpid++] = ulwp->ul_lwpid;
		} else {
			/* move it to the mutex queue */
			if (mp != mp_cache) {
				mp_cache = mp;
				if (mqp != NULL)
					queue_unlock(mqp);
				mqp = queue_lock(mp, MX);
			}
			enqueue(mqp, ulwp, 0);
			mp->mutex_waiters = 1;
		}
	}
	if (mqp != NULL)
		queue_unlock(mqp);
	if (nlwpid == 0) {
		queue_unlock(qp);
	} else {
		no_preempt(self);
		queue_unlock(qp);
		if (nlwpid == 1)
			(void) __lwp_unpark(lwpid[0]);
		else
			(void) __lwp_unpark_all(lwpid, nlwpid);
		preempt(self);
	}
	if (lwpid != buffer)
		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
	return (error);
}

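/*
 * Illustration (editorial sketch, not part of libc): the "move to the mutex
 * queue" paths in cond_signal() and cond_broadcast() above apply only when
 * the waking thread owns the mutex associated with the waiters, i.e. the
 * conventional pattern below (with hypothetical "cv", "lock", and "ready"):
 *
 *	(void) pthread_mutex_lock(&lock);
 *	ready = 1;
 *	(void) pthread_cond_broadcast(&cv);
 *	(void) pthread_mutex_unlock(&lock);
 *
 * Here the waiters are transferred directly onto the sleep queue of "lock"
 * (sometimes called wait morphing) rather than being woken only to block
 * again on the mutex. Broadcasting after dropping the mutex is also legal;
 * in that case the waiters are simply unparked and contend for the mutex
 * themselves.
 */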
#pragma weak pthread_cond_destroy = cond_destroy
int
cond_destroy(cond_t *cvp)
{
	cvp->cond_magic = 0;
	tdb_sync_obj_deregister(cvp);
	return (0);
}

#if defined(DEBUG)
void
assert_no_libc_locks_held(void)
{
	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
}

/* protected by link_lock */
uint64_t spin_lock_spin;
uint64_t spin_lock_spin2;
uint64_t spin_lock_sleep;
uint64_t spin_lock_wakeup;

/*
 * Record spin lock statistics.
 * Called by a thread exiting itself in thrp_exit().
 * Also called via atexit() from the thread calling
 * exit() to do all the other threads as well.
 */
void
record_spin_locks(ulwp_t *ulwp)
{
	spin_lock_spin += ulwp->ul_spin_lock_spin;
	spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
	spin_lock_sleep += ulwp->ul_spin_lock_sleep;
	spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
	ulwp->ul_spin_lock_spin = 0;
	ulwp->ul_spin_lock_spin2 = 0;
	ulwp->ul_spin_lock_sleep = 0;
	ulwp->ul_spin_lock_wakeup = 0;
}

/*
 * atexit function: dump the queue statistics to stderr.
 */
#include <stdio.h>
void
dump_queue_statistics(void)
{
	uberdata_t *udp = curthread->ul_uberdata;
	queue_head_t *qp;
	int qn;
	uint64_t spin_lock_total = 0;

	if (udp->queue_head == NULL || thread_queue_dump == 0)
		return;

	if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
	    fprintf(stderr, "queue# lockcount max qlen max hlen\n") < 0)
		return;
	for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
		if (qp->qh_lockcount == 0)
			continue;
		spin_lock_total += qp->qh_lockcount;
		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
		    (u_longlong_t)qp->qh_lockcount,
		    qp->qh_qmax, qp->qh_hmax) < 0)
			return;
	}

	if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
	    fprintf(stderr, "queue# lockcount max qlen max hlen\n") < 0)
		return;
	for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
		if (qp->qh_lockcount == 0)
			continue;
		spin_lock_total += qp->qh_lockcount;
		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
		    (u_longlong_t)qp->qh_lockcount,
		    qp->qh_qmax, qp->qh_hmax) < 0)
			return;
	}

	(void) fprintf(stderr, "\n spin_lock_total = %10llu\n",
	    (u_longlong_t)spin_lock_total);
	(void) fprintf(stderr, " spin_lock_spin = %10llu\n",
	    (u_longlong_t)spin_lock_spin);
	(void) fprintf(stderr, " spin_lock_spin2 = %10llu\n",
	    (u_longlong_t)spin_lock_spin2);
	(void) fprintf(stderr, " spin_lock_sleep = %10llu\n",
	    (u_longlong_t)spin_lock_sleep);
	(void) fprintf(stderr, " spin_lock_wakeup = %10llu\n",
	    (u_longlong_t)spin_lock_wakeup);
}
#endif