xref: /illumos-gate/usr/src/lib/libc/port/threads/rwlock.c (revision ddb365bfc9e868ad24ccdcb0dc91af18b10df082)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2016 by Delphix. All rights reserved.
25  */
26 
27 #include "lint.h"
28 #include "thr_uberdata.h"
29 #include <sys/sdt.h>
30 
/*
 * Values for the 'rd_wr' argument of rwlock_lock() and shared_rwlock_lock().
 * READ_LOCK or WRITE_LOCK selects the kind of lock wanted; or-ing in
 * TRY_FLAG marks the request as a trylock.  Both functions strip TRY_FLAG
 * off into a separate local before looking at the lock kind.
 */
#define	TRY_FLAG		0x10
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)

#define	NLOCKS	4	/* initial number of readlock_t structs allocated */

/*
 * Sanity-check a snapshot of the rwlock state word: if URW_WRITE_LOCKED
 * is set, then the only other bit that may be set is URW_HAS_WAITERS
 * (so, in particular, the reader count must be zero).
 */
#define	ASSERT_CONSISTENT_STATE(readers)		\
	ASSERT(!((readers) & URW_WRITE_LOCKED) ||	\
		((readers) & ~URW_HAS_WAITERS) == URW_WRITE_LOCKED)
42 
43 /*
44  * Find/allocate an entry for rwlp in our array of rwlocks held for reading.
45  * We must be deferring signals for this to be safe.
46  * Else if we are returning an entry with ul_rdlockcnt == 0,
47  * it could be reassigned behind our back in a signal handler.
48  */
49 static readlock_t *
50 rwl_entry(rwlock_t *rwlp)
51 {
52 	ulwp_t *self = curthread;
53 	readlock_t *remembered = NULL;
54 	readlock_t *readlockp;
55 	uint_t nlocks;
56 
57 	/* we must be deferring signals */
58 	ASSERT((self->ul_critical + self->ul_sigdefer) != 0);
59 
60 	if ((nlocks = self->ul_rdlockcnt) != 0)
61 		readlockp = self->ul_readlock.array;
62 	else {
63 		nlocks = 1;
64 		readlockp = &self->ul_readlock.single;
65 	}
66 
67 	for (; nlocks; nlocks--, readlockp++) {
68 		if (readlockp->rd_rwlock == rwlp)
69 			return (readlockp);
70 		if (readlockp->rd_count == 0 && remembered == NULL)
71 			remembered = readlockp;
72 	}
73 	if (remembered != NULL) {
74 		remembered->rd_rwlock = rwlp;
75 		return (remembered);
76 	}
77 
78 	/*
79 	 * No entry available.  Allocate more space, converting the single
80 	 * readlock_t entry into an array of readlock_t entries if necessary.
81 	 */
82 	if ((nlocks = self->ul_rdlockcnt) == 0) {
83 		/*
84 		 * Initial allocation of the readlock_t array.
85 		 * Convert the single entry into an array.
86 		 */
87 		self->ul_rdlockcnt = nlocks = NLOCKS;
88 		readlockp = lmalloc(nlocks * sizeof (readlock_t));
89 		/*
90 		 * The single readlock_t becomes the first entry in the array.
91 		 */
92 		*readlockp = self->ul_readlock.single;
93 		self->ul_readlock.single.rd_count = 0;
94 		self->ul_readlock.array = readlockp;
95 		/*
96 		 * Return the next available entry in the array.
97 		 */
98 		(++readlockp)->rd_rwlock = rwlp;
99 		return (readlockp);
100 	}
101 	/*
102 	 * Reallocate the array, double the size each time.
103 	 */
104 	readlockp = lmalloc(nlocks * 2 * sizeof (readlock_t));
105 	(void) memcpy(readlockp, self->ul_readlock.array,
106 	    nlocks * sizeof (readlock_t));
107 	lfree(self->ul_readlock.array, nlocks * sizeof (readlock_t));
108 	self->ul_readlock.array = readlockp;
109 	self->ul_rdlockcnt *= 2;
110 	/*
111 	 * Return the next available entry in the newly allocated array.
112 	 */
113 	(readlockp += nlocks)->rd_rwlock = rwlp;
114 	return (readlockp);
115 }
116 
117 /*
118  * Free the array of rwlocks held for reading.
119  */
120 void
121 rwl_free(ulwp_t *ulwp)
122 {
123 	uint_t nlocks;
124 
125 	if ((nlocks = ulwp->ul_rdlockcnt) != 0)
126 		lfree(ulwp->ul_readlock.array, nlocks * sizeof (readlock_t));
127 	ulwp->ul_rdlockcnt = 0;
128 	ulwp->ul_readlock.single.rd_rwlock = NULL;
129 	ulwp->ul_readlock.single.rd_count = 0;
130 }
131 
132 /*
133  * Check if a reader version of the lock is held by the current thread.
134  */
135 #pragma weak _rw_read_held = rw_read_held
136 int
137 rw_read_held(rwlock_t *rwlp)
138 {
139 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
140 	uint32_t readers;
141 	ulwp_t *self = curthread;
142 	readlock_t *readlockp;
143 	uint_t nlocks;
144 	int rval = 0;
145 
146 	no_preempt(self);
147 
148 	readers = *rwstate;
149 	ASSERT_CONSISTENT_STATE(readers);
150 	if (!(readers & URW_WRITE_LOCKED) &&
151 	    (readers & URW_READERS_MASK) != 0) {
152 		/*
153 		 * The lock is held for reading by some thread.
154 		 * Search our array of rwlocks held for reading for a match.
155 		 */
156 		if ((nlocks = self->ul_rdlockcnt) != 0)
157 			readlockp = self->ul_readlock.array;
158 		else {
159 			nlocks = 1;
160 			readlockp = &self->ul_readlock.single;
161 		}
162 		for (; nlocks; nlocks--, readlockp++) {
163 			if (readlockp->rd_rwlock == rwlp) {
164 				if (readlockp->rd_count)
165 					rval = 1;
166 				break;
167 			}
168 		}
169 	}
170 
171 	preempt(self);
172 	return (rval);
173 }
174 
175 /*
176  * Check if a writer version of the lock is held by the current thread.
177  */
178 #pragma weak _rw_write_held = rw_write_held
179 int
180 rw_write_held(rwlock_t *rwlp)
181 {
182 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
183 	uint32_t readers;
184 	ulwp_t *self = curthread;
185 	int rval;
186 
187 	no_preempt(self);
188 
189 	readers = *rwstate;
190 	ASSERT_CONSISTENT_STATE(readers);
191 	rval = ((readers & URW_WRITE_LOCKED) &&
192 	    rwlp->rwlock_owner == (uintptr_t)self &&
193 	    (rwlp->rwlock_type == USYNC_THREAD ||
194 	    rwlp->rwlock_ownerpid == self->ul_uberdata->pid));
195 
196 	preempt(self);
197 	return (rval);
198 }
199 
200 #pragma weak _rwlock_init = rwlock_init
201 int
202 rwlock_init(rwlock_t *rwlp, int type, void *arg __unused)
203 {
204 	ulwp_t *self = curthread;
205 
206 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
207 		return (EINVAL);
208 	/*
209 	 * Once reinitialized, we can no longer be holding a read or write lock.
210 	 * We can do nothing about other threads that are holding read locks.
211 	 */
212 	sigoff(self);
213 	rwl_entry(rwlp)->rd_count = 0;
214 	sigon(self);
215 	(void) memset(rwlp, 0, sizeof (*rwlp));
216 	rwlp->rwlock_type = (uint16_t)type;
217 	rwlp->rwlock_magic = RWL_MAGIC;
218 	rwlp->mutex.mutex_type = (uint8_t)type;
219 	rwlp->mutex.mutex_flag = LOCK_INITED;
220 	rwlp->mutex.mutex_magic = MUTEX_MAGIC;
221 
222 	/*
223 	 * This should be at the beginning of the function,
224 	 * but for the sake of old broken applications that
225 	 * do not have proper alignment for their rwlocks
226 	 * (and don't check the return code from rwlock_init),
227 	 * we put it here, after initializing the rwlock regardless.
228 	 */
229 	if (((uintptr_t)rwlp & (_LONG_LONG_ALIGNMENT - 1)) &&
230 	    self->ul_misaligned == 0)
231 		return (EINVAL);
232 
233 	return (0);
234 }
235 
236 #pragma weak pthread_rwlock_destroy = rwlock_destroy
237 #pragma weak _rwlock_destroy = rwlock_destroy
238 int
239 rwlock_destroy(rwlock_t *rwlp)
240 {
241 	ulwp_t *self = curthread;
242 
243 	/*
244 	 * Once destroyed, we can no longer be holding a read or write lock.
245 	 * We can do nothing about other threads that are holding read locks.
246 	 */
247 	sigoff(self);
248 	rwl_entry(rwlp)->rd_count = 0;
249 	sigon(self);
250 	rwlp->rwlock_magic = 0;
251 	tdb_sync_obj_deregister(rwlp);
252 	return (0);
253 }
254 
255 /*
256  * The following four functions:
257  *	read_lock_try()
258  *	read_unlock_try()
259  *	write_lock_try()
260  *	write_unlock_try()
261  * lie at the heart of the fast-path code for rwlocks,
262  * both process-private and process-shared.
263  *
264  * They are called once without recourse to any other locking primitives.
265  * If they succeed, we are done and the fast-path code was successful.
266  * If they fail, we have to deal with lock queues, either to enqueue
267  * ourself and sleep or to dequeue and wake up someone else (slow paths).
268  *
269  * Unless 'ignore_waiters_flag' is true (a condition that applies only
270  * when read_lock_try() or write_lock_try() is called from code that
271  * is already in the slow path and has already acquired the queue lock),
272  * these functions will always fail if the waiters flag, URW_HAS_WAITERS,
273  * is set in the 'rwstate' word.  Thus, setting the waiters flag on the
274  * rwlock and acquiring the queue lock guarantees exclusive access to
275  * the rwlock (and is the only way to guarantee exclusive access).
276  */
277 
278 /*
279  * Attempt to acquire a readers lock.  Return true on success.
280  */
281 static int
282 read_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
283 {
284 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
285 	uint32_t mask = ignore_waiters_flag?
286 	    URW_WRITE_LOCKED : (URW_HAS_WAITERS | URW_WRITE_LOCKED);
287 	uint32_t readers;
288 	ulwp_t *self = curthread;
289 
290 	no_preempt(self);
291 	while (((readers = *rwstate) & mask) == 0) {
292 		if (atomic_cas_32(rwstate, readers, readers + 1) == readers) {
293 			preempt(self);
294 			return (1);
295 		}
296 	}
297 	preempt(self);
298 	return (0);
299 }
300 
301 /*
302  * Attempt to release a reader lock.  Return true on success.
303  */
304 static int
305 read_unlock_try(rwlock_t *rwlp)
306 {
307 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
308 	uint32_t readers;
309 	ulwp_t *self = curthread;
310 
311 	no_preempt(self);
312 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
313 		if (atomic_cas_32(rwstate, readers, readers - 1) == readers) {
314 			preempt(self);
315 			return (1);
316 		}
317 	}
318 	preempt(self);
319 	return (0);
320 }
321 
322 /*
323  * Attempt to acquire a writer lock.  Return true on success.
324  */
325 static int
326 write_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
327 {
328 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
329 	uint32_t mask = ignore_waiters_flag?
330 	    (URW_WRITE_LOCKED | URW_READERS_MASK) :
331 	    (URW_HAS_WAITERS | URW_WRITE_LOCKED | URW_READERS_MASK);
332 	ulwp_t *self = curthread;
333 	uint32_t readers;
334 
335 	no_preempt(self);
336 	while (((readers = *rwstate) & mask) == 0) {
337 		if (atomic_cas_32(rwstate, readers, readers | URW_WRITE_LOCKED)
338 		    == readers) {
339 			preempt(self);
340 			return (1);
341 		}
342 	}
343 	preempt(self);
344 	return (0);
345 }
346 
347 /*
348  * Attempt to release a writer lock.  Return true on success.
349  */
350 static int
351 write_unlock_try(rwlock_t *rwlp)
352 {
353 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
354 	uint32_t readers;
355 	ulwp_t *self = curthread;
356 
357 	no_preempt(self);
358 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
359 		if (atomic_cas_32(rwstate, readers, 0) == readers) {
360 			preempt(self);
361 			return (1);
362 		}
363 	}
364 	preempt(self);
365 	return (0);
366 }
367 
/*
 * Release a process-private rwlock and wake up any thread(s) sleeping on it.
 * This is called when a thread releases a lock that appears to have waiters.
 *
 * Under the queue lock this drops the caller's hold (writer bit or one
 * reader count).  If URW_HAS_WAITERS was set in the state word, it then
 * takes waiters off the sleep queue -- either one writer, which is handed
 * the lock directly, or a run of readers -- and unparks them once the
 * queue lock has been dropped.
 */
static void
rw_queue_release(rwlock_t *rwlp)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	queue_head_t *qp;
	uint32_t readers;
	uint32_t writer;
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	int nlwpid = 0;
	int more;
	int maxlwps = MAXLWPS;
	lwpid_t buffer[MAXLWPS];
	lwpid_t *lwpid = buffer;

	qp = queue_lock(rwlp, MX);

	/*
	 * Here is where we actually drop the lock,
	 * but we retain the URW_HAS_WAITERS flag, if it is already set.
	 */
	readers = *rwstate;
	ASSERT_CONSISTENT_STATE(readers);
	if (readers & URW_WRITE_LOCKED)	/* drop the writer lock */
		atomic_and_32(rwstate, ~URW_WRITE_LOCKED);
	else				/* drop the readers lock */
		atomic_dec_32(rwstate);
	/* this tests the snapshot taken before the lock was dropped */
	if (!(readers & URW_HAS_WAITERS)) {	/* no waiters */
		queue_unlock(qp);
		return;
	}

	/*
	 * The presence of the URW_HAS_WAITERS flag causes all rwlock
	 * code to go through the slow path, acquiring queue_lock(qp).
	 * Therefore, the rest of this code is safe because we are
	 * holding the queue lock and the URW_HAS_WAITERS flag is set.
	 */

	readers = *rwstate;		/* must fetch the value again */
	ASSERT_CONSISTENT_STATE(readers);
	ASSERT(readers & URW_HAS_WAITERS);
	readers &= URW_READERS_MASK;	/* count of current readers */
	writer = 0;			/* no current writer */

	/*
	 * Examine the queue of waiters in priority order and prepare
	 * to wake up as many readers as we encounter before encountering
	 * a writer.  If the highest priority thread on the queue is a
	 * writer, stop there and wake it up.
	 *
	 * We keep track of lwpids that are to be unparked in lwpid[].
	 * __lwp_unpark_all() is called to unpark all of them after
	 * they have been removed from the sleep queue and the sleep
	 * queue lock has been dropped.  If we run out of space in our
	 * on-stack buffer, we need to allocate more but we can't call
	 * lmalloc() because we are holding a queue lock when the overflow
	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
	 * either because the application may have allocated a small
	 * stack and we don't want to overrun the stack.  So we call
	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
	 * system call directly since that path acquires no locks.
	 */
	while ((ulwpp = queue_slot(qp, &prev, &more)) != NULL) {
		ulwp = *ulwpp;
		ASSERT(ulwp->ul_wchan == rwlp);
		if (ulwp->ul_writer) {
			/*
			 * A writer can be woken only if no writer has been
			 * chosen already and there are no readers, either
			 * still holding the lock or chosen in this loop.
			 */
			if (writer != 0 || readers != 0)
				break;
			/* one writer to wake */
			writer++;
		} else {
			/* readers are not woken behind a chosen writer */
			if (writer != 0)
				break;
			/* at least one reader to wake */
			readers++;
			/* make room before the store into lwpid[] below */
			if (nlwpid == maxlwps)
				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
		}
		queue_unlink(qp, ulwpp, prev);
		ulwp->ul_sleepq = NULL;
		ulwp->ul_wchan = NULL;
		if (writer) {
			/*
			 * Hand off the lock to the writer we will be waking.
			 */
			ASSERT((*rwstate & ~URW_HAS_WAITERS) == 0);
			atomic_or_32(rwstate, URW_WRITE_LOCKED);
			rwlp->rwlock_owner = (uintptr_t)ulwp;
		}
		lwpid[nlwpid++] = ulwp->ul_lwpid;
	}

	/*
	 * This modification of rwstate must be done last.
	 * The presence of the URW_HAS_WAITERS flag causes all rwlock
	 * code to go through the slow path, acquiring queue_lock(qp).
	 * Otherwise the read_lock_try() and write_lock_try() fast paths
	 * are effective.
	 *
	 * ulwpp is NULL only when the loop above ended because
	 * queue_slot() returned NULL rather than through a break.
	 */
	if (ulwpp == NULL)
		atomic_and_32(rwstate, ~URW_HAS_WAITERS);

	if (nlwpid == 0) {
		queue_unlock(qp);
	} else {
		ulwp_t *self = curthread;
		/*
		 * NOTE(review): presumably no_preempt()/preempt() keep us
		 * from being preempted between dropping the queue lock and
		 * issuing the unpark -- confirm against their definitions.
		 */
		no_preempt(self);
		queue_unlock(qp);
		if (nlwpid == 1)
			(void) __lwp_unpark(lwpid[0]);
		else
			(void) __lwp_unpark_all(lwpid, nlwpid);
		preempt(self);
	}
	/* lwpid differs from buffer only if alloc_lwpids() replaced it */
	if (lwpid != buffer)
		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
}
491 
/*
 * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
 * and trywrlock for process-shared (USYNC_PROCESS) rwlocks.
 *
 * Note: if the lock appears to be contended we call __lwp_rwlock_rdlock()
 * or __lwp_rwlock_wrlock() holding the mutex. These return with the mutex
 * released, and if they need to sleep will release the mutex first. In the
 * event of a spurious wakeup, these will return EAGAIN (because it is much
 * easier for us to re-acquire the mutex here).
 *
 * rwlp:  the rwlock to acquire.
 * tsp:   timeout handed to __lwp_rwlock_rdlock()/__lwp_rwlock_wrlock();
 *        not used for trylock requests.
 * rd_wr: READ_LOCK or WRITE_LOCK, optionally or-ed with TRY_FLAG.
 *
 * Returns 0 on success.  Otherwise returns EBUSY for a trylock that finds
 * the lock write-locked, or whatever error mutex_lock() or the
 * __lwp_rwlock_*() call reported; EAGAIN and EINTR from the latter are
 * not returned but cause the whole attempt to be retried.
 */
int
shared_rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	mutex_t *mp = &rwlp->mutex;
	int try_flag;
	int error;

	/* split the request into "is it a trylock" and "reader or writer" */
	try_flag = (rd_wr & TRY_FLAG);
	rd_wr &= ~TRY_FLAG;
	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);

	if (!try_flag) {
		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
	}

	do {
		/* a trylock gives up at once if a writer holds the lock */
		if (try_flag && (*rwstate & URW_WRITE_LOCKED)) {
			error = EBUSY;
			break;
		}
		if ((error = mutex_lock(mp)) != 0)
			break;
		/*
		 * With the mutex held, make one user-level attempt.
		 * On success error is still 0 from mutex_lock() above.
		 */
		if (rd_wr == READ_LOCK) {
			if (read_lock_try(rwlp, 0)) {
				(void) mutex_unlock(mp);
				break;
			}
		} else {
			if (write_lock_try(rwlp, 0)) {
				(void) mutex_unlock(mp);
				break;
			}
		}
		/* the attempt failed; flag waiters before asking the kernel */
		atomic_or_32(rwstate, URW_HAS_WAITERS);

#ifdef THREAD_DEBUG
		uint32_t readers;
		readers = *rwstate;
		ASSERT_CONSISTENT_STATE(readers);
#endif
		/*
		 * The calls to __lwp_rwlock_*() below will release the mutex,
		 * so we need a dtrace probe here.  The owner field of the
		 * mutex is cleared in the kernel when the mutex is released,
		 * so we should not clear it here.
		 */
		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
		/*
		 * The waiters bit may be inaccurate.
		 * Only the kernel knows for sure.
		 */
		if (rd_wr == READ_LOCK) {
			if (try_flag)
				error = __lwp_rwlock_tryrdlock(rwlp);
			else
				error = __lwp_rwlock_rdlock(rwlp, tsp);
		} else {
			if (try_flag)
				error = __lwp_rwlock_trywrlock(rwlp);
			else
				error = __lwp_rwlock_wrlock(rwlp, tsp);
		}
	} while (error == EAGAIN || error == EINTR);

	if (!try_flag) {
		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
	}

	return (error);
}
573 
/*
 * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
 * and trywrlock for process-private (USYNC_THREAD) rwlocks.
 *
 * rwlp:  the rwlock to acquire.
 * tsp:   timeout passed to __lwp_park() each time the caller sleeps.
 * rd_wr: READ_LOCK or WRITE_LOCK, optionally or-ed with TRY_FLAG.
 *
 * Returns 0 once the lock has been acquired (directly or by hand-off
 * from the releasing thread), EBUSY if a trylock would have had to block,
 * or the non-EINTR error returned by __lwp_park().
 */
int
rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	uint32_t readers;
	ulwp_t *self = curthread;
	queue_head_t *qp;
	ulwp_t *ulwp;
	int try_flag;
	int ignore_waiters_flag;
	int error = 0;

	/* split the request into "is it a trylock" and "reader or writer" */
	try_flag = (rd_wr & TRY_FLAG);
	rd_wr &= ~TRY_FLAG;
	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);

	if (!try_flag) {
		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
	}

	qp = queue_lock(rwlp, MX);
	/* initial attempt to acquire the lock fails if there are waiters */
	ignore_waiters_flag = 0;
	while (error == 0) {
		if (rd_wr == READ_LOCK) {
			if (read_lock_try(rwlp, ignore_waiters_flag))
				break;
		} else {
			if (write_lock_try(rwlp, ignore_waiters_flag))
				break;
		}
		/* subsequent attempts do not fail due to waiters */
		ignore_waiters_flag = 1;
		atomic_or_32(rwstate, URW_HAS_WAITERS);
		readers = *rwstate;
		ASSERT_CONSISTENT_STATE(readers);
		if ((readers & URW_WRITE_LOCKED) ||
		    (rd_wr == WRITE_LOCK &&
		    (readers & URW_READERS_MASK) != 0))
			/* EMPTY */;	/* somebody holds the lock */
		else if ((ulwp = queue_waiter(qp)) == NULL) {
			/*
			 * Nobody conflicts and nobody is queued, so the
			 * waiters flag we just set is not needed; clear it
			 * and retry as if for the first time.
			 */
			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
			ignore_waiters_flag = 0;
			continue;	/* no queued waiters, start over */
		} else {
			/*
			 * Do a priority check on the queued waiter (the
			 * highest priority thread on the queue) to see
			 * if we should defer to it or just grab the lock.
			 */
			int our_pri = real_priority(self);
			int his_pri = real_priority(ulwp);

			if (rd_wr == WRITE_LOCK) {
				/*
				 * We defer to a queued thread that has
				 * a higher priority than ours.
				 */
				if (his_pri <= our_pri) {
					/*
					 * Don't defer, just grab the lock.
					 */
					continue;
				}
			} else {
				/*
				 * We defer to a queued thread that has
				 * a higher priority than ours or that
				 * is a writer whose priority equals ours.
				 */
				if (his_pri < our_pri ||
				    (his_pri == our_pri && !ulwp->ul_writer)) {
					/*
					 * Don't defer, just grab the lock.
					 */
					continue;
				}
			}
		}
		/*
		 * We are about to block.
		 * If we're doing a trylock, return EBUSY instead.
		 */
		if (try_flag) {
			error = EBUSY;
			break;
		}
		/*
		 * Enqueue writers ahead of readers.
		 */
		self->ul_writer = rd_wr;	/* *must* be 0 or 1 */
		enqueue(qp, self, 0);
		set_parking_flag(self, 1);
		queue_unlock(qp);
		/* an interrupted park is not an error; the loop just retries */
		if ((error = __lwp_park(tsp, 0)) == EINTR)
			error = 0;
		set_parking_flag(self, 0);
		qp = queue_lock(rwlp, MX);
		/*
		 * If we are still on the sleep queue, nobody dequeued us
		 * to wake us up, so take ourself off.  A zero return from
		 * dequeue_self() is taken to mean that no waiters remain,
		 * in which case the waiters flag is cleared.
		 */
		if (self->ul_sleepq && dequeue_self(qp) == 0) {
			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
			ignore_waiters_flag = 0;
		}
		self->ul_writer = 0;
		if (rd_wr == WRITE_LOCK &&
		    (*rwstate & URW_WRITE_LOCKED) &&
		    rwlp->rwlock_owner == (uintptr_t)self) {
			/*
			 * We acquired the lock by hand-off
			 * from the previous owner.
			 */
			error = 0;	/* timedlock did not fail */
			break;
		}
	}

	/*
	 * Make one final check to see if there are any threads left
	 * on the rwlock queue.  Clear the URW_HAS_WAITERS flag if not.
	 */
	if (qp->qh_root == NULL || qp->qh_root->qr_head == NULL)
		atomic_and_32(rwstate, ~URW_HAS_WAITERS);

	queue_unlock(qp);

	if (!try_flag) {
		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
	}

	return (error);
}
708 
709 int
710 rw_rdlock_impl(rwlock_t *rwlp, timespec_t *tsp)
711 {
712 	ulwp_t *self = curthread;
713 	uberdata_t *udp = self->ul_uberdata;
714 	readlock_t *readlockp;
715 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
716 	int error;
717 
718 	/*
719 	 * If we already hold a readers lock on this rwlock,
720 	 * just increment our reference count and return.
721 	 */
722 	sigoff(self);
723 	readlockp = rwl_entry(rwlp);
724 	if (readlockp->rd_count != 0) {
725 		if (readlockp->rd_count == READ_LOCK_MAX) {
726 			sigon(self);
727 			error = EAGAIN;
728 			goto out;
729 		}
730 		sigon(self);
731 		error = 0;
732 		goto out;
733 	}
734 	sigon(self);
735 
736 	/*
737 	 * If we hold the writer lock, bail out.
738 	 */
739 	if (rw_write_held(rwlp)) {
740 		if (self->ul_error_detection)
741 			rwlock_error(rwlp, "rwlock_rdlock",
742 			    "calling thread owns the writer lock");
743 		error = EDEADLK;
744 		goto out;
745 	}
746 
747 	if (read_lock_try(rwlp, 0))
748 		error = 0;
749 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
750 		error = shared_rwlock_lock(rwlp, tsp, READ_LOCK);
751 	else						/* user-level */
752 		error = rwlock_lock(rwlp, tsp, READ_LOCK);
753 
754 out:
755 	if (error == 0) {
756 		sigoff(self);
757 		rwl_entry(rwlp)->rd_count++;
758 		sigon(self);
759 		if (rwsp)
760 			tdb_incr(rwsp->rw_rdlock);
761 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
762 	} else {
763 		DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK, error);
764 	}
765 
766 	return (error);
767 }
768 
769 #pragma weak pthread_rwlock_rdlock = rw_rdlock
770 #pragma weak _rw_rdlock = rw_rdlock
771 int
772 rw_rdlock(rwlock_t *rwlp)
773 {
774 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
775 	return (rw_rdlock_impl(rwlp, NULL));
776 }
777 
778 void
779 lrw_rdlock(rwlock_t *rwlp)
780 {
781 	enter_critical(curthread);
782 	(void) rw_rdlock_impl(rwlp, NULL);
783 }
784 
785 int
786 pthread_rwlock_reltimedrdlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
787     const struct timespec *_RESTRICT_KYWD reltime)
788 {
789 	timespec_t tslocal = *reltime;
790 	int error;
791 
792 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
793 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
794 	if (error == ETIME)
795 		error = ETIMEDOUT;
796 	return (error);
797 }
798 
799 int
800 pthread_rwlock_timedrdlock(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
801     const struct timespec *_RESTRICT_KYWD abstime)
802 {
803 	timespec_t tslocal;
804 	int error;
805 
806 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
807 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
808 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
809 	if (error == ETIME)
810 		error = ETIMEDOUT;
811 	return (error);
812 }
813 
814 int
815 rw_wrlock_impl(rwlock_t *rwlp, timespec_t *tsp)
816 {
817 	ulwp_t *self = curthread;
818 	uberdata_t *udp = self->ul_uberdata;
819 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
820 	int error;
821 
822 	/*
823 	 * If we hold a readers lock on this rwlock, bail out.
824 	 */
825 	if (rw_read_held(rwlp)) {
826 		if (self->ul_error_detection)
827 			rwlock_error(rwlp, "rwlock_wrlock",
828 			    "calling thread owns the readers lock");
829 		error = EDEADLK;
830 		goto out;
831 	}
832 
833 	/*
834 	 * If we hold the writer lock, bail out.
835 	 */
836 	if (rw_write_held(rwlp)) {
837 		if (self->ul_error_detection)
838 			rwlock_error(rwlp, "rwlock_wrlock",
839 			    "calling thread owns the writer lock");
840 		error = EDEADLK;
841 		goto out;
842 	}
843 
844 	if (write_lock_try(rwlp, 0))
845 		error = 0;
846 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
847 		error = shared_rwlock_lock(rwlp, tsp, WRITE_LOCK);
848 	else						/* user-level */
849 		error = rwlock_lock(rwlp, tsp, WRITE_LOCK);
850 
851 out:
852 	if (error == 0) {
853 		rwlp->rwlock_owner = (uintptr_t)self;
854 		if (rwlp->rwlock_type == USYNC_PROCESS)
855 			rwlp->rwlock_ownerpid = udp->pid;
856 		if (rwsp) {
857 			tdb_incr(rwsp->rw_wrlock);
858 			rwsp->rw_wrlock_begin_hold = gethrtime();
859 		}
860 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
861 	} else {
862 		DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK, error);
863 	}
864 	return (error);
865 }
866 
867 #pragma weak pthread_rwlock_wrlock = rw_wrlock
868 #pragma weak _rw_wrlock = rw_wrlock
869 int
870 rw_wrlock(rwlock_t *rwlp)
871 {
872 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
873 	return (rw_wrlock_impl(rwlp, NULL));
874 }
875 
876 void
877 lrw_wrlock(rwlock_t *rwlp)
878 {
879 	enter_critical(curthread);
880 	(void) rw_wrlock_impl(rwlp, NULL);
881 }
882 
883 int
884 pthread_rwlock_reltimedwrlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
885     const struct timespec *_RESTRICT_KYWD reltime)
886 {
887 	timespec_t tslocal = *reltime;
888 	int error;
889 
890 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
891 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
892 	if (error == ETIME)
893 		error = ETIMEDOUT;
894 	return (error);
895 }
896 
897 int
898 pthread_rwlock_timedwrlock(pthread_rwlock_t *rwlp, const timespec_t *abstime)
899 {
900 	timespec_t tslocal;
901 	int error;
902 
903 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
904 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
905 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
906 	if (error == ETIME)
907 		error = ETIMEDOUT;
908 	return (error);
909 }
910 
911 #pragma weak pthread_rwlock_tryrdlock = rw_tryrdlock
912 int
913 rw_tryrdlock(rwlock_t *rwlp)
914 {
915 	ulwp_t *self = curthread;
916 	uberdata_t *udp = self->ul_uberdata;
917 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
918 	readlock_t *readlockp;
919 	int error;
920 
921 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
922 
923 	if (rwsp)
924 		tdb_incr(rwsp->rw_rdlock_try);
925 
926 	/*
927 	 * If we already hold a readers lock on this rwlock,
928 	 * just increment our reference count and return.
929 	 */
930 	sigoff(self);
931 	readlockp = rwl_entry(rwlp);
932 	if (readlockp->rd_count != 0) {
933 		if (readlockp->rd_count == READ_LOCK_MAX) {
934 			sigon(self);
935 			error = EAGAIN;
936 			goto out;
937 		}
938 		sigon(self);
939 		error = 0;
940 		goto out;
941 	}
942 	sigon(self);
943 
944 	if (read_lock_try(rwlp, 0))
945 		error = 0;
946 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
947 		error = shared_rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
948 	else						/* user-level */
949 		error = rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
950 
951 out:
952 	if (error == 0) {
953 		sigoff(self);
954 		rwl_entry(rwlp)->rd_count++;
955 		sigon(self);
956 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
957 	} else {
958 		if (rwsp)
959 			tdb_incr(rwsp->rw_rdlock_try_fail);
960 		if (error != EBUSY) {
961 			DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK,
962 			    error);
963 		}
964 	}
965 
966 	return (error);
967 }
968 
969 #pragma weak pthread_rwlock_trywrlock = rw_trywrlock
970 int
971 rw_trywrlock(rwlock_t *rwlp)
972 {
973 	ulwp_t *self = curthread;
974 	uberdata_t *udp = self->ul_uberdata;
975 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
976 	int error;
977 
978 	ASSERT(!self->ul_critical || self->ul_bindflags);
979 
980 	if (rwsp)
981 		tdb_incr(rwsp->rw_wrlock_try);
982 
983 	if (write_lock_try(rwlp, 0))
984 		error = 0;
985 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
986 		error = shared_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
987 	else						/* user-level */
988 		error = rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
989 
990 	if (error == 0) {
991 		rwlp->rwlock_owner = (uintptr_t)self;
992 		if (rwlp->rwlock_type == USYNC_PROCESS)
993 			rwlp->rwlock_ownerpid = udp->pid;
994 		if (rwsp)
995 			rwsp->rw_wrlock_begin_hold = gethrtime();
996 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
997 	} else {
998 		if (rwsp)
999 			tdb_incr(rwsp->rw_wrlock_try_fail);
1000 		if (error != EBUSY) {
1001 			DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK,
1002 			    error);
1003 		}
1004 	}
1005 	return (error);
1006 }
1007 
/*
 * Release the calling thread's hold on rwlp, whichever kind it is.
 *
 * The kind of hold is inferred from the state word: if URW_WRITE_LOCKED
 * is set this is treated as a writer unlock, otherwise as a reader
 * unlock.  A recursive reader hold is released simply by decrementing
 * this thread's own count; only the last release touches the lock.
 *
 * Returns 0 on success, or EPERM if the lock is not held at all or is
 * not held by the calling thread.
 */
#pragma weak pthread_rwlock_unlock = rw_unlock
#pragma weak _rw_unlock = rw_unlock
int
rw_unlock(rwlock_t *rwlp)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	uint32_t readers;
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	tdb_rwlock_stats_t *rwsp;
	int rd_wr;

	/* classify the unlock from a single snapshot of the state word */
	readers = *rwstate;
	ASSERT_CONSISTENT_STATE(readers);
	if (readers & URW_WRITE_LOCKED) {
		rd_wr = WRITE_LOCK;
		readers = 0;
	} else {
		rd_wr = READ_LOCK;
		readers &= URW_READERS_MASK;
	}

	if (rd_wr == WRITE_LOCK) {
		/*
		 * Since the writer lock is held, we'd better be
		 * holding it, else we cannot legitimately be here.
		 */
		if (!rw_write_held(rwlp)) {
			if (self->ul_error_detection)
				rwlock_error(rwlp, "rwlock_unlock",
				    "writer lock held, "
				    "but not by the calling thread");
			return (EPERM);
		}
		/* account for the time the writer lock was held */
		if ((rwsp = RWLOCK_STATS(rwlp, udp)) != NULL) {
			if (rwsp->rw_wrlock_begin_hold)
				rwsp->rw_wrlock_hold_time +=
				    gethrtime() - rwsp->rw_wrlock_begin_hold;
			rwsp->rw_wrlock_begin_hold = 0;
		}
		/* ownership is cleared before the lock itself is released */
		rwlp->rwlock_owner = 0;
		rwlp->rwlock_ownerpid = 0;
	} else if (readers > 0) {
		/*
		 * A readers lock is held; if we don't hold one, bail out.
		 */
		readlock_t *readlockp;

		sigoff(self);
		readlockp = rwl_entry(rwlp);
		if (readlockp->rd_count == 0) {
			sigon(self);
			if (self->ul_error_detection)
				rwlock_error(rwlp, "rwlock_unlock",
				    "readers lock held, "
				    "but not by the calling thread");
			return (EPERM);
		}
		/*
		 * If we hold more than one readers lock on this rwlock,
		 * just decrement our reference count and return.
		 */
		if (--readlockp->rd_count != 0) {
			sigon(self);
			goto out;
		}
		sigon(self);
	} else {
		/*
		 * This is a usage error.
		 * No thread should release an unowned lock.
		 */
		if (self->ul_error_detection)
			rwlock_error(rwlp, "rwlock_unlock", "lock not owned");
		return (EPERM);
	}

	/*
	 * Try the fast-path release first.  The *_unlock_try() functions
	 * fail when URW_HAS_WAITERS is set, in which case the release goes
	 * through the kernel (process-shared locks) or through the sleep
	 * queue (process-private locks).
	 */
	if (rd_wr == WRITE_LOCK && write_unlock_try(rwlp)) {
		/* EMPTY */;
	} else if (rd_wr == READ_LOCK && read_unlock_try(rwlp)) {
		/* EMPTY */;
	} else if (rwlp->rwlock_type == USYNC_PROCESS) {
		(void) mutex_lock(&rwlp->mutex);
		(void) __lwp_rwlock_unlock(rwlp);
		(void) mutex_unlock(&rwlp->mutex);
	} else {
		rw_queue_release(rwlp);
	}

out:
	DTRACE_PROBE2(plockstat, rw__release, rwlp, rd_wr);
	return (0);
}
1101 
1102 void
1103 lrw_unlock(rwlock_t *rwlp)
1104 {
1105 	(void) rw_unlock(rwlp);
1106 	exit_critical(curthread);
1107 }
1108