xref: /illumos-gate/usr/src/lib/libc/port/threads/rwlock.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2016 by Delphix. All rights reserved.
25  * Copyright 2024 Oxide Computer Company
26  */
27 
28 #include "lint.h"
29 #include "thr_uberdata.h"
30 #include <sys/sdt.h>
31 
/*
 * Request codes passed as the 'rd_wr' argument of rwlock_lock() and
 * shared_rwlock_lock().  TRY_FLAG is OR-ed onto READ_LOCK or WRITE_LOCK
 * to request a non-blocking attempt; both of those functions mask it off
 * again before comparing against READ_LOCK and WRITE_LOCK.
 */
#define	TRY_FLAG		0x10
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)

#define	NLOCKS	4	/* initial number of readlock_t structs allocated */

/*
 * Assert that a snapshot of the rwlock state word is self-consistent:
 * if URW_WRITE_LOCKED is set, then the only other bit that may be set
 * is URW_HAS_WAITERS.
 */
#define	ASSERT_CONSISTENT_STATE(readers)		\
	ASSERT(!((readers) & URW_WRITE_LOCKED) ||	\
		((readers) & ~URW_HAS_WAITERS) == URW_WRITE_LOCKED)
43 
44 /*
45  * Find/allocate an entry for rwlp in our array of rwlocks held for reading.
46  * We must be deferring signals for this to be safe.
47  * Else if we are returning an entry with ul_rdlockcnt == 0,
48  * it could be reassigned behind our back in a signal handler.
49  */
50 static readlock_t *
51 rwl_entry(rwlock_t *rwlp)
52 {
53 	ulwp_t *self = curthread;
54 	readlock_t *remembered = NULL;
55 	readlock_t *readlockp;
56 	uint_t nlocks;
57 
58 	/* we must be deferring signals */
59 	ASSERT((self->ul_critical + self->ul_sigdefer) != 0);
60 
61 	if ((nlocks = self->ul_rdlockcnt) != 0)
62 		readlockp = self->ul_readlock.array;
63 	else {
64 		nlocks = 1;
65 		readlockp = &self->ul_readlock.single;
66 	}
67 
68 	for (; nlocks; nlocks--, readlockp++) {
69 		if (readlockp->rd_rwlock == rwlp)
70 			return (readlockp);
71 		if (readlockp->rd_count == 0 && remembered == NULL)
72 			remembered = readlockp;
73 	}
74 	if (remembered != NULL) {
75 		remembered->rd_rwlock = rwlp;
76 		return (remembered);
77 	}
78 
79 	/*
80 	 * No entry available.  Allocate more space, converting the single
81 	 * readlock_t entry into an array of readlock_t entries if necessary.
82 	 */
83 	if ((nlocks = self->ul_rdlockcnt) == 0) {
84 		/*
85 		 * Initial allocation of the readlock_t array.
86 		 * Convert the single entry into an array.
87 		 */
88 		self->ul_rdlockcnt = nlocks = NLOCKS;
89 		readlockp = lmalloc(nlocks * sizeof (readlock_t));
90 		/*
91 		 * The single readlock_t becomes the first entry in the array.
92 		 */
93 		*readlockp = self->ul_readlock.single;
94 		self->ul_readlock.single.rd_count = 0;
95 		self->ul_readlock.array = readlockp;
96 		/*
97 		 * Return the next available entry in the array.
98 		 */
99 		(++readlockp)->rd_rwlock = rwlp;
100 		return (readlockp);
101 	}
102 	/*
103 	 * Reallocate the array, double the size each time.
104 	 */
105 	readlockp = lmalloc(nlocks * 2 * sizeof (readlock_t));
106 	(void) memcpy(readlockp, self->ul_readlock.array,
107 	    nlocks * sizeof (readlock_t));
108 	lfree(self->ul_readlock.array, nlocks * sizeof (readlock_t));
109 	self->ul_readlock.array = readlockp;
110 	self->ul_rdlockcnt *= 2;
111 	/*
112 	 * Return the next available entry in the newly allocated array.
113 	 */
114 	(readlockp += nlocks)->rd_rwlock = rwlp;
115 	return (readlockp);
116 }
117 
118 /*
119  * Free the array of rwlocks held for reading.
120  */
121 void
122 rwl_free(ulwp_t *ulwp)
123 {
124 	uint_t nlocks;
125 
126 	if ((nlocks = ulwp->ul_rdlockcnt) != 0)
127 		lfree(ulwp->ul_readlock.array, nlocks * sizeof (readlock_t));
128 	ulwp->ul_rdlockcnt = 0;
129 	ulwp->ul_readlock.single.rd_rwlock = NULL;
130 	ulwp->ul_readlock.single.rd_count = 0;
131 }
132 
133 /*
134  * Check if a reader version of the lock is held by the current thread.
135  */
136 #pragma weak _rw_read_held = rw_read_held
137 int
138 rw_read_held(rwlock_t *rwlp)
139 {
140 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
141 	uint32_t readers;
142 	ulwp_t *self = curthread;
143 	readlock_t *readlockp;
144 	uint_t nlocks;
145 	int rval = 0;
146 
147 	no_preempt(self);
148 
149 	readers = *rwstate;
150 	ASSERT_CONSISTENT_STATE(readers);
151 	if (!(readers & URW_WRITE_LOCKED) &&
152 	    (readers & URW_READERS_MASK) != 0) {
153 		/*
154 		 * The lock is held for reading by some thread.
155 		 * Search our array of rwlocks held for reading for a match.
156 		 */
157 		if ((nlocks = self->ul_rdlockcnt) != 0)
158 			readlockp = self->ul_readlock.array;
159 		else {
160 			nlocks = 1;
161 			readlockp = &self->ul_readlock.single;
162 		}
163 		for (; nlocks; nlocks--, readlockp++) {
164 			if (readlockp->rd_rwlock == rwlp) {
165 				if (readlockp->rd_count)
166 					rval = 1;
167 				break;
168 			}
169 		}
170 	}
171 
172 	preempt(self);
173 	return (rval);
174 }
175 
176 /*
177  * Check if a writer version of the lock is held by the current thread.
178  */
179 #pragma weak _rw_write_held = rw_write_held
180 int
181 rw_write_held(rwlock_t *rwlp)
182 {
183 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
184 	uint32_t readers;
185 	ulwp_t *self = curthread;
186 	int rval;
187 
188 	no_preempt(self);
189 
190 	readers = *rwstate;
191 	ASSERT_CONSISTENT_STATE(readers);
192 	rval = ((readers & URW_WRITE_LOCKED) &&
193 	    rwlp->rwlock_owner == (uintptr_t)self &&
194 	    (rwlp->rwlock_type == USYNC_THREAD ||
195 	    rwlp->rwlock_ownerpid == self->ul_uberdata->pid));
196 
197 	preempt(self);
198 	return (rval);
199 }
200 
201 #pragma weak _rwlock_init = rwlock_init
202 int
203 rwlock_init(rwlock_t *rwlp, int type, void *arg __unused)
204 {
205 	ulwp_t *self = curthread;
206 
207 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
208 		return (EINVAL);
209 	/*
210 	 * Once reinitialized, we can no longer be holding a read or write lock.
211 	 * We can do nothing about other threads that are holding read locks.
212 	 */
213 	sigoff(self);
214 	rwl_entry(rwlp)->rd_count = 0;
215 	sigon(self);
216 	(void) memset(rwlp, 0, sizeof (*rwlp));
217 	rwlp->rwlock_type = (uint16_t)type;
218 	rwlp->rwlock_magic = RWL_MAGIC;
219 	rwlp->mutex.mutex_type = (uint8_t)type;
220 	rwlp->mutex.mutex_flag = LOCK_INITED;
221 	rwlp->mutex.mutex_magic = MUTEX_MAGIC;
222 
223 	/*
224 	 * This should be at the beginning of the function,
225 	 * but for the sake of old broken applications that
226 	 * do not have proper alignment for their rwlocks
227 	 * (and don't check the return code from rwlock_init),
228 	 * we put it here, after initializing the rwlock regardless.
229 	 */
230 	if (((uintptr_t)rwlp & (_LONG_LONG_ALIGNMENT - 1)) &&
231 	    self->ul_misaligned == 0)
232 		return (EINVAL);
233 
234 	return (0);
235 }
236 
237 #pragma weak pthread_rwlock_destroy = rwlock_destroy
238 #pragma weak _rwlock_destroy = rwlock_destroy
239 int
240 rwlock_destroy(rwlock_t *rwlp)
241 {
242 	ulwp_t *self = curthread;
243 
244 	/*
245 	 * Once destroyed, we can no longer be holding a read or write lock.
246 	 * We can do nothing about other threads that are holding read locks.
247 	 */
248 	sigoff(self);
249 	rwl_entry(rwlp)->rd_count = 0;
250 	sigon(self);
251 	rwlp->rwlock_magic = 0;
252 	tdb_sync_obj_deregister(rwlp);
253 	return (0);
254 }
255 
256 /*
257  * The following four functions:
258  *	read_lock_try()
259  *	read_unlock_try()
260  *	write_lock_try()
261  *	write_unlock_try()
262  * lie at the heart of the fast-path code for rwlocks,
263  * both process-private and process-shared.
264  *
265  * They are called once without recourse to any other locking primitives.
266  * If they succeed, we are done and the fast-path code was successful.
267  * If they fail, we have to deal with lock queues, either to enqueue
268  * ourself and sleep or to dequeue and wake up someone else (slow paths).
269  *
270  * Unless 'ignore_waiters_flag' is true (a condition that applies only
271  * when read_lock_try() or write_lock_try() is called from code that
272  * is already in the slow path and has already acquired the queue lock),
273  * these functions will always fail if the waiters flag, URW_HAS_WAITERS,
274  * is set in the 'rwstate' word.  Thus, setting the waiters flag on the
275  * rwlock and acquiring the queue lock guarantees exclusive access to
276  * the rwlock (and is the only way to guarantee exclusive access).
277  */
278 
279 /*
280  * Attempt to acquire a readers lock.  Return true on success.
281  */
282 static int
283 read_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
284 {
285 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
286 	uint32_t mask = ignore_waiters_flag?
287 	    URW_WRITE_LOCKED : (URW_HAS_WAITERS | URW_WRITE_LOCKED);
288 	uint32_t readers;
289 	ulwp_t *self = curthread;
290 
291 	no_preempt(self);
292 	while (((readers = *rwstate) & mask) == 0) {
293 		if (atomic_cas_32(rwstate, readers, readers + 1) == readers) {
294 			preempt(self);
295 			return (1);
296 		}
297 	}
298 	preempt(self);
299 	return (0);
300 }
301 
302 /*
303  * Attempt to release a reader lock.  Return true on success.
304  */
305 static int
306 read_unlock_try(rwlock_t *rwlp)
307 {
308 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
309 	uint32_t readers;
310 	ulwp_t *self = curthread;
311 
312 	no_preempt(self);
313 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
314 		if (atomic_cas_32(rwstate, readers, readers - 1) == readers) {
315 			preempt(self);
316 			return (1);
317 		}
318 	}
319 	preempt(self);
320 	return (0);
321 }
322 
323 /*
324  * Attempt to acquire a writer lock.  Return true on success.
325  */
326 static int
327 write_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
328 {
329 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
330 	uint32_t mask = ignore_waiters_flag?
331 	    (URW_WRITE_LOCKED | URW_READERS_MASK) :
332 	    (URW_HAS_WAITERS | URW_WRITE_LOCKED | URW_READERS_MASK);
333 	ulwp_t *self = curthread;
334 	uint32_t readers;
335 
336 	no_preempt(self);
337 	while (((readers = *rwstate) & mask) == 0) {
338 		if (atomic_cas_32(rwstate, readers, readers | URW_WRITE_LOCKED)
339 		    == readers) {
340 			preempt(self);
341 			return (1);
342 		}
343 	}
344 	preempt(self);
345 	return (0);
346 }
347 
348 /*
349  * Attempt to release a writer lock.  Return true on success.
350  */
351 static int
352 write_unlock_try(rwlock_t *rwlp)
353 {
354 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
355 	uint32_t readers;
356 	ulwp_t *self = curthread;
357 
358 	no_preempt(self);
359 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
360 		if (atomic_cas_32(rwstate, readers, 0) == readers) {
361 			preempt(self);
362 			return (1);
363 		}
364 	}
365 	preempt(self);
366 	return (0);
367 }
368 
/*
 * Release a process-private rwlock and wake up any thread(s) sleeping on it.
 * This is called when a thread releases a lock that appears to have waiters.
 *
 * rwlp is the rwlock being released by the calling thread.
 *
 * While holding the queue lock, this drops the caller's hold (the write bit
 * or one reader count) and then walks the sleep queue selecting threads to
 * wake: either a run of readers, or a single writer to which the lock is
 * handed off directly.  The selected threads are unparked only after the
 * queue lock has been dropped.
 */
static void
rw_queue_release(rwlock_t *rwlp)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	queue_head_t *qp;
	uint32_t readers;
	uint32_t writer;
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	int nlwpid = 0;
	int more;
	int maxlwps = MAXLWPS;
	lwpid_t buffer[MAXLWPS];
	lwpid_t *lwpid = buffer;

	qp = queue_lock(rwlp, MX);

	/*
	 * Here is where we actually drop the lock,
	 * but we retain the URW_HAS_WAITERS flag, if it is already set.
	 */
	readers = *rwstate;
	ASSERT_CONSISTENT_STATE(readers);
	if (readers & URW_WRITE_LOCKED)	/* drop the writer lock */
		atomic_and_32(rwstate, ~URW_WRITE_LOCKED);
	else				/* drop the readers lock */
		atomic_dec_32(rwstate);
	/* This test uses the snapshot taken before the lock was dropped. */
	if (!(readers & URW_HAS_WAITERS)) {	/* no waiters */
		queue_unlock(qp);
		return;
	}

	/*
	 * The presence of the URW_HAS_WAITERS flag causes all rwlock
	 * code to go through the slow path, acquiring queue_lock(qp).
	 * Therefore, the rest of this code is safe because we are
	 * holding the queue lock and the URW_HAS_WAITERS flag is set.
	 */

	readers = *rwstate;		/* must fetch the value again */
	ASSERT_CONSISTENT_STATE(readers);
	ASSERT(readers & URW_HAS_WAITERS);
	readers &= URW_READERS_MASK;	/* count of current readers */
	writer = 0;			/* no current writer */

	/*
	 * Examine the queue of waiters in priority order and prepare
	 * to wake up as many readers as we encounter before encountering
	 * a writer.  If the highest priority thread on the queue is a
	 * writer, stop there and wake it up.
	 *
	 * We keep track of lwpids that are to be unparked in lwpid[].
	 * __lwp_unpark_all() is called to unpark all of them after
	 * they have been removed from the sleep queue and the sleep
	 * queue lock has been dropped.  If we run out of space in our
	 * on-stack buffer, we need to allocate more but we can't call
	 * lmalloc() because we are holding a queue lock when the overflow
	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
	 * either because the application may have allocated a small
	 * stack and we don't want to overrun the stack.  So we call
	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
	 * system call directly since that path acquires no locks.
	 */
	while ((ulwpp = queue_slot(qp, &prev, &more)) != NULL) {
		ulwp = *ulwpp;
		ASSERT(ulwp->ul_wchan == rwlp);
		if (ulwp->ul_writer) {
			/*
			 * A writer is only chosen when no reader holds or
			 * has been chosen for the lock and no other writer
			 * has already been chosen.
			 */
			if (writer != 0 || readers != 0)
				break;
			/* one writer to wake */
			writer++;
		} else {
			/* No reader is chosen once a writer has been chosen. */
			if (writer != 0)
				break;
			/*
			 * at least one reader to wake
			 *
			 * Only the local count is incremented here; this
			 * function does not add woken readers to rwstate.
			 */
			readers++;
			if (nlwpid == maxlwps)
				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
		}
		queue_unlink(qp, ulwpp, prev);
		ulwp->ul_sleepq = NULL;
		ulwp->ul_wchan = NULL;
		if (writer) {
			/*
			 * Hand off the lock to the writer we will be waking.
			 */
			ASSERT((*rwstate & ~URW_HAS_WAITERS) == 0);
			atomic_or_32(rwstate, URW_WRITE_LOCKED);
			rwlp->rwlock_owner = (uintptr_t)ulwp;
		}
		lwpid[nlwpid++] = ulwp->ul_lwpid;
	}

	/*
	 * This modification of rwstate must be done last.
	 * The presence of the URW_HAS_WAITERS flag causes all rwlock
	 * code to go through the slow path, acquiring queue_lock(qp).
	 * Otherwise the read_lock_try() and write_lock_try() fast paths
	 * are effective.
	 */
	/* ulwpp is NULL only if the loop ended because queue_slot() did. */
	if (ulwpp == NULL)
		atomic_and_32(rwstate, ~URW_HAS_WAITERS);

	if (nlwpid == 0) {
		queue_unlock(qp);
	} else {
		ulwp_t *self = curthread;
		no_preempt(self);
		queue_unlock(qp);
		if (nlwpid == 1)
			(void) __lwp_unpark(lwpid[0]);
		else
			(void) __lwp_unpark_all(lwpid, nlwpid);
		preempt(self);
	}
	/* Release the overflow buffer if alloc_lwpids() replaced buffer[]. */
	if (lwpid != buffer)
		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
}
492 
/*
 * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
 * and trywrlock for process-shared (USYNC_PROCESS) rwlocks.
 *
 * rwlp:  the rwlock to acquire.
 * tsp:   timeout handed to __lwp_rwlock_rdlock()/__lwp_rwlock_wrlock();
 *        not used for the try variants.
 * rd_wr: READ_LOCK or WRITE_LOCK, optionally OR-ed with TRY_FLAG.
 *
 * Returns 0 on success.  Otherwise returns EBUSY for a try request that
 * observes the lock write-locked, or the error from mutex_lock() or from
 * the __lwp_rwlock_*() call.  EAGAIN and EINTR from the latter are not
 * returned; they cause the whole attempt to be repeated.
 *
 * Note: if the lock appears to be contended we call __lwp_rwlock_rdlock()
 * or __lwp_rwlock_wrlock() holding the mutex. These return with the mutex
 * released, and if they need to sleep will release the mutex first. In the
 * event of a spurious wakeup, these will return EAGAIN (because it is much
 * easier for us to re-acquire the mutex here).
 */
int
shared_rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	mutex_t *mp = &rwlp->mutex;
	int try_flag;
	int error;

	/* Split the request into its try bit and its lock kind. */
	try_flag = (rd_wr & TRY_FLAG);
	rd_wr &= ~TRY_FLAG;
	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);

	if (!try_flag) {
		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
	}

	do {
		/* A try request fails at once while a writer holds the lock. */
		if (try_flag && (*rwstate & URW_WRITE_LOCKED)) {
			error = EBUSY;
			break;
		}
		if ((error = mutex_lock(mp)) != 0)
			break;
		/*
		 * Retry the fast path now that we hold the mutex.  On
		 * success error is still 0 from mutex_lock() above.
		 */
		if (rd_wr == READ_LOCK) {
			if (read_lock_try(rwlp, 0)) {
				(void) mutex_unlock(mp);
				break;
			}
		} else {
			if (write_lock_try(rwlp, 0)) {
				(void) mutex_unlock(mp);
				break;
			}
		}
		atomic_or_32(rwstate, URW_HAS_WAITERS);

#ifdef DEBUG
		uint32_t readers;
		readers = *rwstate;
		ASSERT_CONSISTENT_STATE(readers);
#endif
		/*
		 * The calls to __lwp_rwlock_*() below will release the mutex,
		 * so we need a dtrace probe here.  The owner field of the
		 * mutex is cleared in the kernel when the mutex is released,
		 * so we should not clear it here.
		 */
		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
		/*
		 * The waiters bit may be inaccurate.
		 * Only the kernel knows for sure.
		 */
		if (rd_wr == READ_LOCK) {
			if (try_flag)
				error = __lwp_rwlock_tryrdlock(rwlp);
			else
				error = __lwp_rwlock_rdlock(rwlp, tsp);
		} else {
			if (try_flag)
				error = __lwp_rwlock_trywrlock(rwlp);
			else
				error = __lwp_rwlock_wrlock(rwlp, tsp);
		}
	} while (error == EAGAIN || error == EINTR);

	if (!try_flag) {
		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
	}

	return (error);
}
574 
/*
 * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
 * and trywrlock for process-private (USYNC_THREAD) rwlocks.
 *
 * rwlp:  the rwlock to acquire.
 * tsp:   timeout handed to __lwp_park() when the caller has to sleep.
 * rd_wr: READ_LOCK or WRITE_LOCK, optionally OR-ed with TRY_FLAG.
 *
 * Returns 0 once the lock has been acquired, either by a successful
 * read_lock_try()/write_lock_try() or, for a writer, by hand-off from the
 * previous owner.  Returns EBUSY for a try request that would otherwise
 * have to block, or the error from __lwp_park() (EINTR excepted, which is
 * treated as a wakeup and retried).
 */
int
rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	uint32_t readers;
	ulwp_t *self = curthread;
	queue_head_t *qp;
	ulwp_t *ulwp;
	int try_flag;
	int ignore_waiters_flag;
	int error = 0;

	/* Split the request into its try bit and its lock kind. */
	try_flag = (rd_wr & TRY_FLAG);
	rd_wr &= ~TRY_FLAG;
	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);

	if (!try_flag) {
		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
	}

	qp = queue_lock(rwlp, MX);
	/* initial attempt to acquire the lock fails if there are waiters */
	ignore_waiters_flag = 0;
	while (error == 0) {
		if (rd_wr == READ_LOCK) {
			if (read_lock_try(rwlp, ignore_waiters_flag))
				break;
		} else {
			if (write_lock_try(rwlp, ignore_waiters_flag))
				break;
		}
		/* subsequent attempts do not fail due to waiters */
		ignore_waiters_flag = 1;
		atomic_or_32(rwstate, URW_HAS_WAITERS);
		readers = *rwstate;
		ASSERT_CONSISTENT_STATE(readers);
		if ((readers & URW_WRITE_LOCKED) ||
		    (rd_wr == WRITE_LOCK &&
		    (readers & URW_READERS_MASK) != 0))
			/* EMPTY */;	/* somebody holds the lock */
		else if ((ulwp = queue_waiter(qp)) == NULL) {
			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
			ignore_waiters_flag = 0;
			continue;	/* no queued waiters, start over */
		} else {
			/*
			 * Do a priority check on the queued waiter (the
			 * highest priority thread on the queue) to see
			 * if we should defer to it or just grab the lock.
			 */
			int our_pri = real_priority(self);
			int his_pri = real_priority(ulwp);

			if (rd_wr == WRITE_LOCK) {
				/*
				 * We defer to a queued thread that has
				 * a higher priority than ours.
				 */
				if (his_pri <= our_pri) {
					/*
					 * Don't defer, just grab the lock.
					 */
					continue;
				}
			} else {
				/*
				 * We defer to a queued thread that has
				 * a higher priority than ours or that
				 * is a writer whose priority equals ours.
				 */
				if (his_pri < our_pri ||
				    (his_pri == our_pri && !ulwp->ul_writer)) {
					/*
					 * Don't defer, just grab the lock.
					 */
					continue;
				}
			}
		}
		/*
		 * We are about to block.
		 * If we're doing a trylock, return EBUSY instead.
		 */
		if (try_flag) {
			error = EBUSY;
			break;
		}
		/*
		 * Enqueue writers ahead of readers.
		 */
		self->ul_writer = rd_wr;	/* *must* be 0 or 1 */
		enqueue(qp, self, 0);
		set_parking_flag(self, 1);
		queue_unlock(qp);
		/* EINTR is treated as an ordinary wakeup: loop and retry. */
		if ((error = __lwp_park(tsp, 0)) == EINTR)
			error = 0;
		set_parking_flag(self, 0);
		qp = queue_lock(rwlp, MX);
		/*
		 * If we are still on the sleep queue, take ourself off it;
		 * a zero return from dequeue_self() leads to the waiters
		 * flag being cleared here.
		 */
		if (self->ul_sleepq && dequeue_self(qp) == 0) {
			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
			ignore_waiters_flag = 0;
		}
		self->ul_writer = 0;
		if (rd_wr == WRITE_LOCK &&
		    (*rwstate & URW_WRITE_LOCKED) &&
		    rwlp->rwlock_owner == (uintptr_t)self) {
			/*
			 * We acquired the lock by hand-off
			 * from the previous owner,
			 */
			error = 0;	/* timedlock did not fail */
			break;
		}
	}

	/*
	 * Make one final check to see if there are any threads left
	 * on the rwlock queue.  Clear the URW_HAS_WAITERS flag if not.
	 */
	if (qp->qh_root == NULL || qp->qh_root->qr_head == NULL)
		atomic_and_32(rwstate, ~URW_HAS_WAITERS);

	queue_unlock(qp);

	if (!try_flag) {
		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
	}

	return (error);
}
709 
710 int
711 rw_rdlock_impl(rwlock_t *rwlp, timespec_t *tsp)
712 {
713 	ulwp_t *self = curthread;
714 	uberdata_t *udp = self->ul_uberdata;
715 	readlock_t *readlockp;
716 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
717 	int error;
718 
719 	/*
720 	 * If we already hold a readers lock on this rwlock,
721 	 * just increment our reference count and return.
722 	 */
723 	sigoff(self);
724 	readlockp = rwl_entry(rwlp);
725 	if (readlockp->rd_count != 0) {
726 		if (readlockp->rd_count == READ_LOCK_MAX) {
727 			sigon(self);
728 			error = EAGAIN;
729 			goto out;
730 		}
731 		sigon(self);
732 		error = 0;
733 		goto out;
734 	}
735 	sigon(self);
736 
737 	/*
738 	 * If we hold the writer lock, bail out.
739 	 */
740 	if (rw_write_held(rwlp)) {
741 		if (self->ul_error_detection)
742 			rwlock_error(rwlp, "rwlock_rdlock",
743 			    "calling thread owns the writer lock");
744 		error = EDEADLK;
745 		goto out;
746 	}
747 
748 	if (read_lock_try(rwlp, 0))
749 		error = 0;
750 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
751 		error = shared_rwlock_lock(rwlp, tsp, READ_LOCK);
752 	else						/* user-level */
753 		error = rwlock_lock(rwlp, tsp, READ_LOCK);
754 
755 out:
756 	if (error == 0) {
757 		sigoff(self);
758 		rwl_entry(rwlp)->rd_count++;
759 		sigon(self);
760 		if (rwsp)
761 			tdb_incr(rwsp->rw_rdlock);
762 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
763 	} else {
764 		DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK, error);
765 	}
766 
767 	return (error);
768 }
769 
770 #pragma weak pthread_rwlock_rdlock = rw_rdlock
771 #pragma weak _rw_rdlock = rw_rdlock
772 int
773 rw_rdlock(rwlock_t *rwlp)
774 {
775 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
776 	return (rw_rdlock_impl(rwlp, NULL));
777 }
778 
779 void
780 lrw_rdlock(rwlock_t *rwlp)
781 {
782 	enter_critical(curthread);
783 	(void) rw_rdlock_impl(rwlp, NULL);
784 }
785 
786 int
787 pthread_rwlock_relclockrdlock_np(pthread_rwlock_t *restrict rwlp,
788     clockid_t clock, const struct timespec *restrict reltime)
789 {
790 	timespec_t tslocal = *reltime;
791 	int error;
792 
793 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
794 
795 	switch (clock) {
796 	case CLOCK_REALTIME:
797 	case CLOCK_HIGHRES:
798 		break;
799 	default:
800 		return (EINVAL);
801 	}
802 
803 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
804 	if (error == ETIME)
805 		error = ETIMEDOUT;
806 	return (error);
807 }
808 
809 int
810 pthread_rwlock_reltimedrdlock_np(pthread_rwlock_t *restrict rwlp,
811     const struct timespec *restrict reltime)
812 {
813 	return (pthread_rwlock_relclockrdlock_np(rwlp, CLOCK_REALTIME,
814 	    reltime));
815 }
816 
817 int
818 pthread_rwlock_clockrdlock(pthread_rwlock_t *restrict rwlp, clockid_t clock,
819     const struct timespec *restrict abstime)
820 {
821 	timespec_t tslocal;
822 	int error;
823 
824 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
825 
826 	switch (clock) {
827 	case CLOCK_REALTIME:
828 	case CLOCK_HIGHRES:
829 		break;
830 	default:
831 		return (EINVAL);
832 	}
833 
834 	abstime_to_reltime(clock, abstime, &tslocal);
835 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
836 	if (error == ETIME)
837 		error = ETIMEDOUT;
838 	return (error);
839 }
840 
841 int
842 pthread_rwlock_timedrdlock(pthread_rwlock_t *restrict rwlp,
843     const struct timespec *restrict abstime)
844 {
845 	return (pthread_rwlock_clockrdlock(rwlp, CLOCK_REALTIME, abstime));
846 }
847 
848 int
849 rw_wrlock_impl(rwlock_t *rwlp, timespec_t *tsp)
850 {
851 	ulwp_t *self = curthread;
852 	uberdata_t *udp = self->ul_uberdata;
853 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
854 	int error;
855 
856 	/*
857 	 * If we hold a readers lock on this rwlock, bail out.
858 	 */
859 	if (rw_read_held(rwlp)) {
860 		if (self->ul_error_detection)
861 			rwlock_error(rwlp, "rwlock_wrlock",
862 			    "calling thread owns the readers lock");
863 		error = EDEADLK;
864 		goto out;
865 	}
866 
867 	/*
868 	 * If we hold the writer lock, bail out.
869 	 */
870 	if (rw_write_held(rwlp)) {
871 		if (self->ul_error_detection)
872 			rwlock_error(rwlp, "rwlock_wrlock",
873 			    "calling thread owns the writer lock");
874 		error = EDEADLK;
875 		goto out;
876 	}
877 
878 	if (write_lock_try(rwlp, 0))
879 		error = 0;
880 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
881 		error = shared_rwlock_lock(rwlp, tsp, WRITE_LOCK);
882 	else						/* user-level */
883 		error = rwlock_lock(rwlp, tsp, WRITE_LOCK);
884 
885 out:
886 	if (error == 0) {
887 		rwlp->rwlock_owner = (uintptr_t)self;
888 		if (rwlp->rwlock_type == USYNC_PROCESS)
889 			rwlp->rwlock_ownerpid = udp->pid;
890 		if (rwsp) {
891 			tdb_incr(rwsp->rw_wrlock);
892 			rwsp->rw_wrlock_begin_hold = gethrtime();
893 		}
894 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
895 	} else {
896 		DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK, error);
897 	}
898 	return (error);
899 }
900 
901 #pragma weak pthread_rwlock_wrlock = rw_wrlock
902 #pragma weak _rw_wrlock = rw_wrlock
903 int
904 rw_wrlock(rwlock_t *rwlp)
905 {
906 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
907 	return (rw_wrlock_impl(rwlp, NULL));
908 }
909 
910 void
911 lrw_wrlock(rwlock_t *rwlp)
912 {
913 	enter_critical(curthread);
914 	(void) rw_wrlock_impl(rwlp, NULL);
915 }
916 
917 int
918 pthread_rwlock_relclockwrlock_np(pthread_rwlock_t *restrict rwlp,
919     clockid_t clock, const struct timespec *restrict reltime)
920 {
921 	timespec_t tslocal = *reltime;
922 	int error;
923 
924 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
925 
926 	switch (clock) {
927 	case CLOCK_REALTIME:
928 	case CLOCK_HIGHRES:
929 		break;
930 	default:
931 		return (EINVAL);
932 	}
933 
934 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
935 	if (error == ETIME)
936 		error = ETIMEDOUT;
937 	return (error);
938 }
939 
940 int
941 pthread_rwlock_reltimedwrlock_np(pthread_rwlock_t *restrict rwlp,
942     const struct timespec *restrict reltime)
943 {
944 	return (pthread_rwlock_relclockwrlock_np(rwlp, CLOCK_REALTIME,
945 	    reltime));
946 }
947 
948 int
949 pthread_rwlock_clockwrlock(pthread_rwlock_t *rwlp, clockid_t clock,
950     const timespec_t *abstime)
951 {
952 	timespec_t tslocal;
953 	int error;
954 
955 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
956 
957 	switch (clock) {
958 	case CLOCK_REALTIME:
959 	case CLOCK_HIGHRES:
960 		break;
961 	default:
962 		return (EINVAL);
963 	}
964 
965 	abstime_to_reltime(clock, abstime, &tslocal);
966 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
967 	if (error == ETIME)
968 		error = ETIMEDOUT;
969 	return (error);
970 }
971 
972 int
973 pthread_rwlock_timedwrlock(pthread_rwlock_t *rwlp, const timespec_t *abstime)
974 {
975 	return (pthread_rwlock_clockwrlock(rwlp, CLOCK_REALTIME, abstime));
976 }
977 
978 #pragma weak pthread_rwlock_tryrdlock = rw_tryrdlock
979 int
980 rw_tryrdlock(rwlock_t *rwlp)
981 {
982 	ulwp_t *self = curthread;
983 	uberdata_t *udp = self->ul_uberdata;
984 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
985 	readlock_t *readlockp;
986 	int error;
987 
988 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
989 
990 	if (rwsp)
991 		tdb_incr(rwsp->rw_rdlock_try);
992 
993 	/*
994 	 * If we already hold a readers lock on this rwlock,
995 	 * just increment our reference count and return.
996 	 */
997 	sigoff(self);
998 	readlockp = rwl_entry(rwlp);
999 	if (readlockp->rd_count != 0) {
1000 		if (readlockp->rd_count == READ_LOCK_MAX) {
1001 			sigon(self);
1002 			error = EAGAIN;
1003 			goto out;
1004 		}
1005 		sigon(self);
1006 		error = 0;
1007 		goto out;
1008 	}
1009 	sigon(self);
1010 
1011 	if (read_lock_try(rwlp, 0))
1012 		error = 0;
1013 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
1014 		error = shared_rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
1015 	else						/* user-level */
1016 		error = rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
1017 
1018 out:
1019 	if (error == 0) {
1020 		sigoff(self);
1021 		rwl_entry(rwlp)->rd_count++;
1022 		sigon(self);
1023 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
1024 	} else {
1025 		if (rwsp)
1026 			tdb_incr(rwsp->rw_rdlock_try_fail);
1027 		if (error != EBUSY) {
1028 			DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK,
1029 			    error);
1030 		}
1031 	}
1032 
1033 	return (error);
1034 }
1035 
1036 #pragma weak pthread_rwlock_trywrlock = rw_trywrlock
1037 int
1038 rw_trywrlock(rwlock_t *rwlp)
1039 {
1040 	ulwp_t *self = curthread;
1041 	uberdata_t *udp = self->ul_uberdata;
1042 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
1043 	int error;
1044 
1045 	ASSERT(!self->ul_critical || self->ul_bindflags);
1046 
1047 	if (rwsp)
1048 		tdb_incr(rwsp->rw_wrlock_try);
1049 
1050 	if (write_lock_try(rwlp, 0))
1051 		error = 0;
1052 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
1053 		error = shared_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
1054 	else						/* user-level */
1055 		error = rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
1056 
1057 	if (error == 0) {
1058 		rwlp->rwlock_owner = (uintptr_t)self;
1059 		if (rwlp->rwlock_type == USYNC_PROCESS)
1060 			rwlp->rwlock_ownerpid = udp->pid;
1061 		if (rwsp)
1062 			rwsp->rw_wrlock_begin_hold = gethrtime();
1063 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
1064 	} else {
1065 		if (rwsp)
1066 			tdb_incr(rwsp->rw_wrlock_try_fail);
1067 		if (error != EBUSY) {
1068 			DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK,
1069 			    error);
1070 		}
1071 	}
1072 	return (error);
1073 }
1074 
1075 #pragma weak pthread_rwlock_unlock = rw_unlock
1076 #pragma weak _rw_unlock = rw_unlock
1077 int
1078 rw_unlock(rwlock_t *rwlp)
1079 {
1080 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
1081 	uint32_t readers;
1082 	ulwp_t *self = curthread;
1083 	uberdata_t *udp = self->ul_uberdata;
1084 	tdb_rwlock_stats_t *rwsp;
1085 	int rd_wr;
1086 
1087 	readers = *rwstate;
1088 	ASSERT_CONSISTENT_STATE(readers);
1089 	if (readers & URW_WRITE_LOCKED) {
1090 		rd_wr = WRITE_LOCK;
1091 		readers = 0;
1092 	} else {
1093 		rd_wr = READ_LOCK;
1094 		readers &= URW_READERS_MASK;
1095 	}
1096 
1097 	if (rd_wr == WRITE_LOCK) {
1098 		/*
1099 		 * Since the writer lock is held, we'd better be
1100 		 * holding it, else we cannot legitimately be here.
1101 		 */
1102 		if (!rw_write_held(rwlp)) {
1103 			if (self->ul_error_detection)
1104 				rwlock_error(rwlp, "rwlock_unlock",
1105 				    "writer lock held, "
1106 				    "but not by the calling thread");
1107 			return (EPERM);
1108 		}
1109 		if ((rwsp = RWLOCK_STATS(rwlp, udp)) != NULL) {
1110 			if (rwsp->rw_wrlock_begin_hold)
1111 				rwsp->rw_wrlock_hold_time +=
1112 				    gethrtime() - rwsp->rw_wrlock_begin_hold;
1113 			rwsp->rw_wrlock_begin_hold = 0;
1114 		}
1115 		rwlp->rwlock_owner = 0;
1116 		rwlp->rwlock_ownerpid = 0;
1117 	} else if (readers > 0) {
1118 		/*
1119 		 * A readers lock is held; if we don't hold one, bail out.
1120 		 */
1121 		readlock_t *readlockp;
1122 
1123 		sigoff(self);
1124 		readlockp = rwl_entry(rwlp);
1125 		if (readlockp->rd_count == 0) {
1126 			sigon(self);
1127 			if (self->ul_error_detection)
1128 				rwlock_error(rwlp, "rwlock_unlock",
1129 				    "readers lock held, "
1130 				    "but not by the calling thread");
1131 			return (EPERM);
1132 		}
1133 		/*
1134 		 * If we hold more than one readers lock on this rwlock,
1135 		 * just decrement our reference count and return.
1136 		 */
1137 		if (--readlockp->rd_count != 0) {
1138 			sigon(self);
1139 			goto out;
1140 		}
1141 		sigon(self);
1142 	} else {
1143 		/*
1144 		 * This is a usage error.
1145 		 * No thread should release an unowned lock.
1146 		 */
1147 		if (self->ul_error_detection)
1148 			rwlock_error(rwlp, "rwlock_unlock", "lock not owned");
1149 		return (EPERM);
1150 	}
1151 
1152 	if (rd_wr == WRITE_LOCK && write_unlock_try(rwlp)) {
1153 		/* EMPTY */;
1154 	} else if (rd_wr == READ_LOCK && read_unlock_try(rwlp)) {
1155 		/* EMPTY */;
1156 	} else if (rwlp->rwlock_type == USYNC_PROCESS) {
1157 		(void) mutex_lock(&rwlp->mutex);
1158 		(void) __lwp_rwlock_unlock(rwlp);
1159 		(void) mutex_unlock(&rwlp->mutex);
1160 	} else {
1161 		rw_queue_release(rwlp);
1162 	}
1163 
1164 out:
1165 	DTRACE_PROBE2(plockstat, rw__release, rwlp, rd_wr);
1166 	return (0);
1167 }
1168 
1169 void
1170 lrw_unlock(rwlock_t *rwlp)
1171 {
1172 	(void) rw_unlock(rwlp);
1173 	exit_critical(curthread);
1174 }
1175