xref: /titanic_44/usr/src/lib/libc/port/threads/rwlock.c (revision 58d1a73c51105d779ddacf2ae9553bae44a39ff4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "lint.h"
30 #include "thr_uberdata.h"
31 #include <sys/sdt.h>
32 
/*
 * Request codes for the rd_wr argument of rwlock_lock() and
 * shared_rwlock_lock().  The basic request is READ_LOCK or WRITE_LOCK;
 * or'ing in TRY_FLAG selects the corresponding trylock variant.
 */
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	TRY_FLAG		0x10
#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)

#define	NLOCKS	4	/* initial number of readlock_t structs allocated */

/*
 * Debug check on a snapshot of the rwlock state word: if the
 * write-locked bit is set, then no bit other than the has-waiters
 * bit may be set along with it.
 */
#define	ASSERT_CONSISTENT_STATE(readers)		\
	ASSERT(!((readers) & URW_WRITE_LOCKED) ||	\
		((readers) & ~URW_HAS_WAITERS) == URW_WRITE_LOCKED)
44 
45 /*
46  * Find/allocate an entry for rwlp in our array of rwlocks held for reading.
47  * We must be deferring signals for this to be safe.
48  * Else if we are returning an entry with ul_rdlockcnt == 0,
49  * it could be reassigned behind our back in a signal handler.
50  */
51 static readlock_t *
52 rwl_entry(rwlock_t *rwlp)
53 {
54 	ulwp_t *self = curthread;
55 	readlock_t *remembered = NULL;
56 	readlock_t *readlockp;
57 	uint_t nlocks;
58 
59 	/* we must be deferring signals */
60 	ASSERT((self->ul_critical + self->ul_sigdefer) != 0);
61 
62 	if ((nlocks = self->ul_rdlockcnt) != 0)
63 		readlockp = self->ul_readlock.array;
64 	else {
65 		nlocks = 1;
66 		readlockp = &self->ul_readlock.single;
67 	}
68 
69 	for (; nlocks; nlocks--, readlockp++) {
70 		if (readlockp->rd_rwlock == rwlp)
71 			return (readlockp);
72 		if (readlockp->rd_count == 0 && remembered == NULL)
73 			remembered = readlockp;
74 	}
75 	if (remembered != NULL) {
76 		remembered->rd_rwlock = rwlp;
77 		return (remembered);
78 	}
79 
80 	/*
81 	 * No entry available.  Allocate more space, converting the single
82 	 * readlock_t entry into an array of readlock_t entries if necessary.
83 	 */
84 	if ((nlocks = self->ul_rdlockcnt) == 0) {
85 		/*
86 		 * Initial allocation of the readlock_t array.
87 		 * Convert the single entry into an array.
88 		 */
89 		self->ul_rdlockcnt = nlocks = NLOCKS;
90 		readlockp = lmalloc(nlocks * sizeof (readlock_t));
91 		/*
92 		 * The single readlock_t becomes the first entry in the array.
93 		 */
94 		*readlockp = self->ul_readlock.single;
95 		self->ul_readlock.single.rd_count = 0;
96 		self->ul_readlock.array = readlockp;
97 		/*
98 		 * Return the next available entry in the array.
99 		 */
100 		(++readlockp)->rd_rwlock = rwlp;
101 		return (readlockp);
102 	}
103 	/*
104 	 * Reallocate the array, double the size each time.
105 	 */
106 	readlockp = lmalloc(nlocks * 2 * sizeof (readlock_t));
107 	(void) memcpy(readlockp, self->ul_readlock.array,
108 	    nlocks * sizeof (readlock_t));
109 	lfree(self->ul_readlock.array, nlocks * sizeof (readlock_t));
110 	self->ul_readlock.array = readlockp;
111 	self->ul_rdlockcnt *= 2;
112 	/*
113 	 * Return the next available entry in the newly allocated array.
114 	 */
115 	(readlockp += nlocks)->rd_rwlock = rwlp;
116 	return (readlockp);
117 }
118 
119 /*
120  * Free the array of rwlocks held for reading.
121  */
122 void
123 rwl_free(ulwp_t *ulwp)
124 {
125 	uint_t nlocks;
126 
127 	if ((nlocks = ulwp->ul_rdlockcnt) != 0)
128 		lfree(ulwp->ul_readlock.array, nlocks * sizeof (readlock_t));
129 	ulwp->ul_rdlockcnt = 0;
130 	ulwp->ul_readlock.single.rd_rwlock = NULL;
131 	ulwp->ul_readlock.single.rd_count = 0;
132 }
133 
134 /*
135  * Check if a reader version of the lock is held by the current thread.
136  */
137 #pragma weak _rw_read_held = rw_read_held
138 int
139 rw_read_held(rwlock_t *rwlp)
140 {
141 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
142 	uint32_t readers;
143 	ulwp_t *self = curthread;
144 	readlock_t *readlockp;
145 	uint_t nlocks;
146 	int rval = 0;
147 
148 	no_preempt(self);
149 
150 	readers = *rwstate;
151 	ASSERT_CONSISTENT_STATE(readers);
152 	if (!(readers & URW_WRITE_LOCKED) &&
153 	    (readers & URW_READERS_MASK) != 0) {
154 		/*
155 		 * The lock is held for reading by some thread.
156 		 * Search our array of rwlocks held for reading for a match.
157 		 */
158 		if ((nlocks = self->ul_rdlockcnt) != 0)
159 			readlockp = self->ul_readlock.array;
160 		else {
161 			nlocks = 1;
162 			readlockp = &self->ul_readlock.single;
163 		}
164 		for (; nlocks; nlocks--, readlockp++) {
165 			if (readlockp->rd_rwlock == rwlp) {
166 				if (readlockp->rd_count)
167 					rval = 1;
168 				break;
169 			}
170 		}
171 	}
172 
173 	preempt(self);
174 	return (rval);
175 }
176 
177 /*
178  * Check if a writer version of the lock is held by the current thread.
179  */
180 #pragma weak _rw_write_held = rw_write_held
181 int
182 rw_write_held(rwlock_t *rwlp)
183 {
184 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
185 	uint32_t readers;
186 	ulwp_t *self = curthread;
187 	int rval;
188 
189 	no_preempt(self);
190 
191 	readers = *rwstate;
192 	ASSERT_CONSISTENT_STATE(readers);
193 	rval = ((readers & URW_WRITE_LOCKED) &&
194 	    rwlp->rwlock_owner == (uintptr_t)self &&
195 	    (rwlp->rwlock_type == USYNC_THREAD ||
196 	    rwlp->rwlock_ownerpid == self->ul_uberdata->pid));
197 
198 	preempt(self);
199 	return (rval);
200 }
201 
202 #pragma weak _rwlock_init = rwlock_init
203 /* ARGSUSED2 */
204 int
205 rwlock_init(rwlock_t *rwlp, int type, void *arg)
206 {
207 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
208 		return (EINVAL);
209 	/*
210 	 * Once reinitialized, we can no longer be holding a read or write lock.
211 	 * We can do nothing about other threads that are holding read locks.
212 	 */
213 	sigoff(curthread);
214 	rwl_entry(rwlp)->rd_count = 0;
215 	sigon(curthread);
216 	(void) memset(rwlp, 0, sizeof (*rwlp));
217 	rwlp->rwlock_type = (uint16_t)type;
218 	rwlp->rwlock_magic = RWL_MAGIC;
219 	rwlp->mutex.mutex_type = (uint8_t)type;
220 	rwlp->mutex.mutex_flag = LOCK_INITED;
221 	rwlp->mutex.mutex_magic = MUTEX_MAGIC;
222 	return (0);
223 }
224 
225 #pragma weak pthread_rwlock_destroy = rwlock_destroy
226 #pragma weak _rwlock_destroy = rwlock_destroy
227 int
228 rwlock_destroy(rwlock_t *rwlp)
229 {
230 	/*
231 	 * Once destroyed, we can no longer be holding a read or write lock.
232 	 * We can do nothing about other threads that are holding read locks.
233 	 */
234 	sigoff(curthread);
235 	rwl_entry(rwlp)->rd_count = 0;
236 	sigon(curthread);
237 	rwlp->rwlock_magic = 0;
238 	tdb_sync_obj_deregister(rwlp);
239 	return (0);
240 }
241 
242 /*
243  * Attempt to acquire a readers lock.  Return true on success.
244  */
245 static int
246 read_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
247 {
248 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
249 	uint32_t mask = ignore_waiters_flag?
250 	    URW_WRITE_LOCKED : (URW_HAS_WAITERS | URW_WRITE_LOCKED);
251 	uint32_t readers;
252 	ulwp_t *self = curthread;
253 
254 	no_preempt(self);
255 	while (((readers = *rwstate) & mask) == 0) {
256 		if (atomic_cas_32(rwstate, readers, readers + 1) == readers) {
257 			preempt(self);
258 			return (1);
259 		}
260 	}
261 	preempt(self);
262 	return (0);
263 }
264 
265 /*
266  * Attempt to release a reader lock.  Return true on success.
267  */
268 static int
269 read_unlock_try(rwlock_t *rwlp)
270 {
271 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
272 	uint32_t readers;
273 	ulwp_t *self = curthread;
274 
275 	no_preempt(self);
276 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
277 		if (atomic_cas_32(rwstate, readers, readers - 1) == readers) {
278 			preempt(self);
279 			return (1);
280 		}
281 	}
282 	preempt(self);
283 	return (0);
284 }
285 
286 /*
287  * Attempt to acquire a writer lock.  Return true on success.
288  */
289 static int
290 write_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
291 {
292 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
293 	uint32_t mask = ignore_waiters_flag?
294 	    (URW_WRITE_LOCKED | URW_READERS_MASK) :
295 	    (URW_HAS_WAITERS | URW_WRITE_LOCKED | URW_READERS_MASK);
296 	ulwp_t *self = curthread;
297 	uint32_t readers;
298 
299 	no_preempt(self);
300 	while (((readers = *rwstate) & mask) == 0) {
301 		if (atomic_cas_32(rwstate, readers, readers | URW_WRITE_LOCKED)
302 		    == readers) {
303 			preempt(self);
304 			return (1);
305 		}
306 	}
307 	preempt(self);
308 	return (0);
309 }
310 
311 /*
312  * Attempt to release a writer lock.  Return true on success.
313  */
314 static int
315 write_unlock_try(rwlock_t *rwlp)
316 {
317 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
318 	uint32_t readers;
319 	ulwp_t *self = curthread;
320 
321 	no_preempt(self);
322 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
323 		if (atomic_cas_32(rwstate, readers, 0) == readers) {
324 			preempt(self);
325 			return (1);
326 		}
327 	}
328 	preempt(self);
329 	return (0);
330 }
331 
332 /*
333  * Wake up thread(s) sleeping on the rwlock queue and then
334  * drop the queue lock.  Return non-zero if we wake up someone.
335  * This is called when a thread releases a lock that appears to have waiters.
336  */
337 static int
338 rw_queue_release(queue_head_t *qp, rwlock_t *rwlp)
339 {
340 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
341 	uint32_t readers;
342 	uint32_t writers;
343 	ulwp_t **ulwpp;
344 	ulwp_t *ulwp;
345 	ulwp_t *prev;
346 	int nlwpid = 0;
347 	int more;
348 	int maxlwps = MAXLWPS;
349 	lwpid_t buffer[MAXLWPS];
350 	lwpid_t *lwpid = buffer;
351 
352 	readers = *rwstate;
353 	ASSERT_CONSISTENT_STATE(readers);
354 	if (!(readers & URW_HAS_WAITERS)) {
355 		queue_unlock(qp);
356 		return (0);
357 	}
358 	readers &= URW_READERS_MASK;
359 	writers = 0;
360 
361 	/*
362 	 * Examine the queue of waiters in priority order and prepare
363 	 * to wake up as many readers as we encounter before encountering
364 	 * a writer.  If the highest priority thread on the queue is a
365 	 * writer, stop there and wake it up.
366 	 *
367 	 * We keep track of lwpids that are to be unparked in lwpid[].
368 	 * __lwp_unpark_all() is called to unpark all of them after
369 	 * they have been removed from the sleep queue and the sleep
370 	 * queue lock has been dropped.  If we run out of space in our
371 	 * on-stack buffer, we need to allocate more but we can't call
372 	 * lmalloc() because we are holding a queue lock when the overflow
373 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
374 	 * either because the application may have allocated a small
375 	 * stack and we don't want to overrun the stack.  So we call
376 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
377 	 * system call directly since that path acquires no locks.
378 	 */
379 	while ((ulwpp = queue_slot(qp, &prev, &more)) != NULL) {
380 		ulwp = *ulwpp;
381 		ASSERT(ulwp->ul_wchan == rwlp);
382 		if (ulwp->ul_writer) {
383 			if (writers != 0 || readers != 0)
384 				break;
385 			/* one writer to wake */
386 			writers++;
387 		} else {
388 			if (writers != 0)
389 				break;
390 			/* at least one reader to wake */
391 			readers++;
392 			if (nlwpid == maxlwps)
393 				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
394 		}
395 		queue_unlink(qp, ulwpp, prev);
396 		ulwp->ul_sleepq = NULL;
397 		ulwp->ul_wchan = NULL;
398 		lwpid[nlwpid++] = ulwp->ul_lwpid;
399 	}
400 	if (ulwpp == NULL)
401 		atomic_and_32(rwstate, ~URW_HAS_WAITERS);
402 	if (nlwpid == 0) {
403 		queue_unlock(qp);
404 	} else {
405 		ulwp_t *self = curthread;
406 		no_preempt(self);
407 		queue_unlock(qp);
408 		if (nlwpid == 1)
409 			(void) __lwp_unpark(lwpid[0]);
410 		else
411 			(void) __lwp_unpark_all(lwpid, nlwpid);
412 		preempt(self);
413 	}
414 	if (lwpid != buffer)
415 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
416 	return (nlwpid != 0);
417 }
418 
419 /*
420  * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
421  * and trywrlock for process-shared (USYNC_PROCESS) rwlocks.
422  *
423  * Note: if the lock appears to be contended we call __lwp_rwlock_rdlock()
424  * or __lwp_rwlock_wrlock() holding the mutex. These return with the mutex
425  * released, and if they need to sleep will release the mutex first. In the
426  * event of a spurious wakeup, these will return EAGAIN (because it is much
427  * easier for us to re-acquire the mutex here).
428  */
429 int
430 shared_rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
431 {
432 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
433 	mutex_t *mp = &rwlp->mutex;
434 	uint32_t readers;
435 	int try_flag;
436 	int error;
437 
438 	try_flag = (rd_wr & TRY_FLAG);
439 	rd_wr &= ~TRY_FLAG;
440 	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);
441 
442 	if (!try_flag) {
443 		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
444 	}
445 
446 	do {
447 		if (try_flag && (*rwstate & URW_WRITE_LOCKED)) {
448 			error = EBUSY;
449 			break;
450 		}
451 		if ((error = mutex_lock(mp)) != 0)
452 			break;
453 		if (rd_wr == READ_LOCK) {
454 			if (read_lock_try(rwlp, 0)) {
455 				(void) mutex_unlock(mp);
456 				break;
457 			}
458 		} else {
459 			if (write_lock_try(rwlp, 0)) {
460 				(void) mutex_unlock(mp);
461 				break;
462 			}
463 		}
464 		atomic_or_32(rwstate, URW_HAS_WAITERS);
465 		readers = *rwstate;
466 		ASSERT_CONSISTENT_STATE(readers);
467 		/*
468 		 * The calls to __lwp_rwlock_*() below will release the mutex,
469 		 * so we need a dtrace probe here.
470 		 */
471 		mp->mutex_owner = 0;
472 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
473 		/*
474 		 * The waiters bit may be inaccurate.
475 		 * Only the kernel knows for sure.
476 		 */
477 		if (rd_wr == READ_LOCK) {
478 			if (try_flag)
479 				error = __lwp_rwlock_tryrdlock(rwlp);
480 			else
481 				error = __lwp_rwlock_rdlock(rwlp, tsp);
482 		} else {
483 			if (try_flag)
484 				error = __lwp_rwlock_trywrlock(rwlp);
485 			else
486 				error = __lwp_rwlock_wrlock(rwlp, tsp);
487 		}
488 	} while (error == EAGAIN || error == EINTR);
489 
490 	if (!try_flag) {
491 		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
492 	}
493 
494 	return (error);
495 }
496 
497 /*
498  * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
499  * and trywrlock for process-private (USYNC_THREAD) rwlocks.
500  */
501 int
502 rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
503 {
504 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
505 	uint32_t readers;
506 	ulwp_t *self = curthread;
507 	queue_head_t *qp;
508 	ulwp_t *ulwp;
509 	int try_flag;
510 	int ignore_waiters_flag;
511 	int error = 0;
512 
513 	try_flag = (rd_wr & TRY_FLAG);
514 	rd_wr &= ~TRY_FLAG;
515 	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);
516 
517 	if (!try_flag) {
518 		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
519 	}
520 
521 	qp = queue_lock(rwlp, MX);
522 	/* initial attempt to acquire the lock fails if there are waiters */
523 	ignore_waiters_flag = 0;
524 	while (error == 0) {
525 		if (rd_wr == READ_LOCK) {
526 			if (read_lock_try(rwlp, ignore_waiters_flag))
527 				break;
528 		} else {
529 			if (write_lock_try(rwlp, ignore_waiters_flag))
530 				break;
531 		}
532 		/* subsequent attempts do not fail due to waiters */
533 		ignore_waiters_flag = 1;
534 		atomic_or_32(rwstate, URW_HAS_WAITERS);
535 		readers = *rwstate;
536 		ASSERT_CONSISTENT_STATE(readers);
537 		if ((readers & URW_WRITE_LOCKED) ||
538 		    (rd_wr == WRITE_LOCK &&
539 		    (readers & URW_READERS_MASK) != 0))
540 			/* EMPTY */;	/* somebody holds the lock */
541 		else if ((ulwp = queue_waiter(qp)) == NULL) {
542 			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
543 			continue;	/* no queued waiters, try again */
544 		} else {
545 			/*
546 			 * Do a priority check on the queued waiter (the
547 			 * highest priority thread on the queue) to see
548 			 * if we should defer to him or just grab the lock.
549 			 */
550 			int our_pri = real_priority(self);
551 			int his_pri = real_priority(ulwp);
552 
553 			if (rd_wr == WRITE_LOCK) {
554 				/*
555 				 * We defer to a queued thread that has
556 				 * a higher priority than ours.
557 				 */
558 				if (his_pri <= our_pri)
559 					continue;	/* try again */
560 			} else {
561 				/*
562 				 * We defer to a queued thread that has
563 				 * a higher priority than ours or that
564 				 * is a writer whose priority equals ours.
565 				 */
566 				if (his_pri < our_pri ||
567 				    (his_pri == our_pri && !ulwp->ul_writer))
568 					continue;	/* try again */
569 			}
570 		}
571 		/*
572 		 * We are about to block.
573 		 * If we're doing a trylock, return EBUSY instead.
574 		 */
575 		if (try_flag) {
576 			error = EBUSY;
577 			break;
578 		}
579 		/*
580 		 * Enqueue writers ahead of readers.
581 		 */
582 		self->ul_writer = rd_wr;	/* *must* be 0 or 1 */
583 		enqueue(qp, self, 0);
584 		set_parking_flag(self, 1);
585 		queue_unlock(qp);
586 		if ((error = __lwp_park(tsp, 0)) == EINTR)
587 			error = ignore_waiters_flag = 0;
588 		set_parking_flag(self, 0);
589 		qp = queue_lock(rwlp, MX);
590 		if (self->ul_sleepq && dequeue_self(qp) == 0)
591 			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
592 		self->ul_writer = 0;
593 	}
594 
595 	queue_unlock(qp);
596 
597 	if (!try_flag) {
598 		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
599 	}
600 
601 	return (error);
602 }
603 
604 int
605 rw_rdlock_impl(rwlock_t *rwlp, timespec_t *tsp)
606 {
607 	ulwp_t *self = curthread;
608 	uberdata_t *udp = self->ul_uberdata;
609 	readlock_t *readlockp;
610 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
611 	int error;
612 
613 	/*
614 	 * If we already hold a readers lock on this rwlock,
615 	 * just increment our reference count and return.
616 	 */
617 	sigoff(self);
618 	readlockp = rwl_entry(rwlp);
619 	if (readlockp->rd_count != 0) {
620 		if (readlockp->rd_count == READ_LOCK_MAX) {
621 			sigon(self);
622 			error = EAGAIN;
623 			goto out;
624 		}
625 		sigon(self);
626 		error = 0;
627 		goto out;
628 	}
629 	sigon(self);
630 
631 	/*
632 	 * If we hold the writer lock, bail out.
633 	 */
634 	if (rw_write_held(rwlp)) {
635 		if (self->ul_error_detection)
636 			rwlock_error(rwlp, "rwlock_rdlock",
637 			    "calling thread owns the writer lock");
638 		error = EDEADLK;
639 		goto out;
640 	}
641 
642 	if (read_lock_try(rwlp, 0))
643 		error = 0;
644 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
645 		error = shared_rwlock_lock(rwlp, tsp, READ_LOCK);
646 	else						/* user-level */
647 		error = rwlock_lock(rwlp, tsp, READ_LOCK);
648 
649 out:
650 	if (error == 0) {
651 		sigoff(self);
652 		rwl_entry(rwlp)->rd_count++;
653 		sigon(self);
654 		if (rwsp)
655 			tdb_incr(rwsp->rw_rdlock);
656 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
657 	} else {
658 		DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK, error);
659 	}
660 
661 	return (error);
662 }
663 
664 #pragma weak pthread_rwlock_rdlock = rw_rdlock
665 #pragma weak _rw_rdlock = rw_rdlock
666 int
667 rw_rdlock(rwlock_t *rwlp)
668 {
669 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
670 	return (rw_rdlock_impl(rwlp, NULL));
671 }
672 
673 void
674 lrw_rdlock(rwlock_t *rwlp)
675 {
676 	enter_critical(curthread);
677 	(void) rw_rdlock_impl(rwlp, NULL);
678 }
679 
680 int
681 pthread_rwlock_reltimedrdlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
682     const struct timespec *_RESTRICT_KYWD reltime)
683 {
684 	timespec_t tslocal = *reltime;
685 	int error;
686 
687 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
688 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
689 	if (error == ETIME)
690 		error = ETIMEDOUT;
691 	return (error);
692 }
693 
694 int
695 pthread_rwlock_timedrdlock(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
696     const struct timespec *_RESTRICT_KYWD abstime)
697 {
698 	timespec_t tslocal;
699 	int error;
700 
701 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
702 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
703 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
704 	if (error == ETIME)
705 		error = ETIMEDOUT;
706 	return (error);
707 }
708 
709 int
710 rw_wrlock_impl(rwlock_t *rwlp, timespec_t *tsp)
711 {
712 	ulwp_t *self = curthread;
713 	uberdata_t *udp = self->ul_uberdata;
714 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
715 	int error;
716 
717 	/*
718 	 * If we hold a readers lock on this rwlock, bail out.
719 	 */
720 	if (rw_read_held(rwlp)) {
721 		if (self->ul_error_detection)
722 			rwlock_error(rwlp, "rwlock_wrlock",
723 			    "calling thread owns the readers lock");
724 		error = EDEADLK;
725 		goto out;
726 	}
727 
728 	/*
729 	 * If we hold the writer lock, bail out.
730 	 */
731 	if (rw_write_held(rwlp)) {
732 		if (self->ul_error_detection)
733 			rwlock_error(rwlp, "rwlock_wrlock",
734 			    "calling thread owns the writer lock");
735 		error = EDEADLK;
736 		goto out;
737 	}
738 
739 	if (write_lock_try(rwlp, 0))
740 		error = 0;
741 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
742 		error = shared_rwlock_lock(rwlp, tsp, WRITE_LOCK);
743 	else						/* user-level */
744 		error = rwlock_lock(rwlp, tsp, WRITE_LOCK);
745 
746 out:
747 	if (error == 0) {
748 		rwlp->rwlock_owner = (uintptr_t)self;
749 		if (rwlp->rwlock_type == USYNC_PROCESS)
750 			rwlp->rwlock_ownerpid = udp->pid;
751 		if (rwsp) {
752 			tdb_incr(rwsp->rw_wrlock);
753 			rwsp->rw_wrlock_begin_hold = gethrtime();
754 		}
755 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
756 	} else {
757 		DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK, error);
758 	}
759 	return (error);
760 }
761 
762 #pragma weak pthread_rwlock_wrlock = rw_wrlock
763 #pragma weak _rw_wrlock = rw_wrlock
764 int
765 rw_wrlock(rwlock_t *rwlp)
766 {
767 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
768 	return (rw_wrlock_impl(rwlp, NULL));
769 }
770 
771 void
772 lrw_wrlock(rwlock_t *rwlp)
773 {
774 	enter_critical(curthread);
775 	(void) rw_wrlock_impl(rwlp, NULL);
776 }
777 
778 int
779 pthread_rwlock_reltimedwrlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
780     const struct timespec *_RESTRICT_KYWD reltime)
781 {
782 	timespec_t tslocal = *reltime;
783 	int error;
784 
785 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
786 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
787 	if (error == ETIME)
788 		error = ETIMEDOUT;
789 	return (error);
790 }
791 
792 int
793 pthread_rwlock_timedwrlock(pthread_rwlock_t *rwlp, const timespec_t *abstime)
794 {
795 	timespec_t tslocal;
796 	int error;
797 
798 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
799 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
800 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
801 	if (error == ETIME)
802 		error = ETIMEDOUT;
803 	return (error);
804 }
805 
806 #pragma weak pthread_rwlock_tryrdlock = rw_tryrdlock
807 int
808 rw_tryrdlock(rwlock_t *rwlp)
809 {
810 	ulwp_t *self = curthread;
811 	uberdata_t *udp = self->ul_uberdata;
812 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
813 	readlock_t *readlockp;
814 	int error;
815 
816 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
817 
818 	if (rwsp)
819 		tdb_incr(rwsp->rw_rdlock_try);
820 
821 	/*
822 	 * If we already hold a readers lock on this rwlock,
823 	 * just increment our reference count and return.
824 	 */
825 	sigoff(self);
826 	readlockp = rwl_entry(rwlp);
827 	if (readlockp->rd_count != 0) {
828 		if (readlockp->rd_count == READ_LOCK_MAX) {
829 			sigon(self);
830 			error = EAGAIN;
831 			goto out;
832 		}
833 		sigon(self);
834 		error = 0;
835 		goto out;
836 	}
837 	sigon(self);
838 
839 	if (read_lock_try(rwlp, 0))
840 		error = 0;
841 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
842 		error = shared_rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
843 	else						/* user-level */
844 		error = rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
845 
846 out:
847 	if (error == 0) {
848 		sigoff(self);
849 		rwl_entry(rwlp)->rd_count++;
850 		sigon(self);
851 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
852 	} else {
853 		if (rwsp)
854 			tdb_incr(rwsp->rw_rdlock_try_fail);
855 		if (error != EBUSY) {
856 			DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK,
857 			    error);
858 		}
859 	}
860 
861 	return (error);
862 }
863 
864 #pragma weak pthread_rwlock_trywrlock = rw_trywrlock
865 int
866 rw_trywrlock(rwlock_t *rwlp)
867 {
868 	ulwp_t *self = curthread;
869 	uberdata_t *udp = self->ul_uberdata;
870 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
871 	int error;
872 
873 	ASSERT(!self->ul_critical || self->ul_bindflags);
874 
875 	if (rwsp)
876 		tdb_incr(rwsp->rw_wrlock_try);
877 
878 	if (write_lock_try(rwlp, 0))
879 		error = 0;
880 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
881 		error = shared_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
882 	else						/* user-level */
883 		error = rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
884 
885 	if (error == 0) {
886 		rwlp->rwlock_owner = (uintptr_t)self;
887 		if (rwlp->rwlock_type == USYNC_PROCESS)
888 			rwlp->rwlock_ownerpid = udp->pid;
889 		if (rwsp)
890 			rwsp->rw_wrlock_begin_hold = gethrtime();
891 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
892 	} else {
893 		if (rwsp)
894 			tdb_incr(rwsp->rw_wrlock_try_fail);
895 		if (error != EBUSY) {
896 			DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK,
897 			    error);
898 		}
899 	}
900 	return (error);
901 }
902 
903 #pragma weak pthread_rwlock_unlock = rw_unlock
904 #pragma weak _rw_unlock = rw_unlock
905 int
906 rw_unlock(rwlock_t *rwlp)
907 {
908 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
909 	uint32_t readers;
910 	ulwp_t *self = curthread;
911 	uberdata_t *udp = self->ul_uberdata;
912 	tdb_rwlock_stats_t *rwsp;
913 	queue_head_t *qp;
914 	int rd_wr;
915 	int waked = 0;
916 
917 	readers = *rwstate;
918 	ASSERT_CONSISTENT_STATE(readers);
919 	if (readers & URW_WRITE_LOCKED) {
920 		rd_wr = WRITE_LOCK;
921 		readers = 0;
922 	} else {
923 		rd_wr = READ_LOCK;
924 		readers &= URW_READERS_MASK;
925 	}
926 
927 	if (rd_wr == WRITE_LOCK) {
928 		/*
929 		 * Since the writer lock is held, we'd better be
930 		 * holding it, else we cannot legitimately be here.
931 		 */
932 		if (!rw_write_held(rwlp)) {
933 			if (self->ul_error_detection)
934 				rwlock_error(rwlp, "rwlock_unlock",
935 				    "writer lock held, "
936 				    "but not by the calling thread");
937 			return (EPERM);
938 		}
939 		if ((rwsp = RWLOCK_STATS(rwlp, udp)) != NULL) {
940 			if (rwsp->rw_wrlock_begin_hold)
941 				rwsp->rw_wrlock_hold_time +=
942 				    gethrtime() - rwsp->rw_wrlock_begin_hold;
943 			rwsp->rw_wrlock_begin_hold = 0;
944 		}
945 		rwlp->rwlock_owner = 0;
946 		rwlp->rwlock_ownerpid = 0;
947 	} else if (readers > 0) {
948 		/*
949 		 * A readers lock is held; if we don't hold one, bail out.
950 		 */
951 		readlock_t *readlockp;
952 
953 		sigoff(self);
954 		readlockp = rwl_entry(rwlp);
955 		if (readlockp->rd_count == 0) {
956 			sigon(self);
957 			if (self->ul_error_detection)
958 				rwlock_error(rwlp, "rwlock_unlock",
959 				    "readers lock held, "
960 				    "but not by the calling thread");
961 			return (EPERM);
962 		}
963 		/*
964 		 * If we hold more than one readers lock on this rwlock,
965 		 * just decrement our reference count and return.
966 		 */
967 		if (--readlockp->rd_count != 0) {
968 			sigon(self);
969 			goto out;
970 		}
971 		sigon(self);
972 	} else {
973 		/*
974 		 * This is a usage error.
975 		 * No thread should release an unowned lock.
976 		 */
977 		if (self->ul_error_detection)
978 			rwlock_error(rwlp, "rwlock_unlock", "lock not owned");
979 		return (EPERM);
980 	}
981 
982 	if (rd_wr == WRITE_LOCK && write_unlock_try(rwlp)) {
983 		/* EMPTY */;
984 	} else if (rd_wr == READ_LOCK && read_unlock_try(rwlp)) {
985 		/* EMPTY */;
986 	} else if (rwlp->rwlock_type == USYNC_PROCESS) {
987 		(void) mutex_lock(&rwlp->mutex);
988 		(void) __lwp_rwlock_unlock(rwlp);
989 		(void) mutex_unlock(&rwlp->mutex);
990 		waked = 1;
991 	} else {
992 		qp = queue_lock(rwlp, MX);
993 		if (rd_wr == READ_LOCK)
994 			atomic_dec_32(rwstate);
995 		else
996 			atomic_and_32(rwstate, ~URW_WRITE_LOCKED);
997 		waked = rw_queue_release(qp, rwlp);
998 	}
999 
1000 out:
1001 	DTRACE_PROBE2(plockstat, rw__release, rwlp, rd_wr);
1002 
1003 	/*
1004 	 * Yield to the thread we just waked up, just in case we might
1005 	 * be about to grab the rwlock again immediately upon return.
1006 	 * This is pretty weak but it helps on a uniprocessor and also
1007 	 * when cpu affinity has assigned both ourself and the other
1008 	 * thread to the same CPU.  Note that lwp_yield() will yield
1009 	 * the processor only if the writer is at the same or higher
1010 	 * priority than ourself.  This provides more balanced program
1011 	 * behavior; it doesn't guarantee acquisition of the lock by
1012 	 * the pending writer.
1013 	 */
1014 	if (waked)
1015 		yield();
1016 	return (0);
1017 }
1018 
1019 void
1020 lrw_unlock(rwlock_t *rwlp)
1021 {
1022 	(void) rw_unlock(rwlp);
1023 	exit_critical(curthread);
1024 }
1025