xref: /titanic_44/usr/src/lib/libc/port/threads/rwlock.c (revision 66e150d7d3c0cb2de3c45c74612784ffd3e73de6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "lint.h"
30 #include "thr_uberdata.h"
31 #include <sys/sdt.h>
32 
/*
 * Lock-request codes passed as the rd_wr argument of rwlock_lock() and
 * shared_rwlock_lock().  TRY_FLAG is or'ed into a request to ask for a
 * non-blocking attempt; both functions strip it off again before they
 * look at the READ_LOCK/WRITE_LOCK value.
 */
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	TRY_FLAG		0x10
#define	READ_LOCK_TRY		(TRY_FLAG | READ_LOCK)
#define	WRITE_LOCK_TRY		(TRY_FLAG | WRITE_LOCK)

/* number of readlock_t entries in a thread's first allocated array */
#define	NLOCKS	4

/*
 * Sanity check on a snapshot of the rwlock state word: whenever the
 * write-locked bit is set, the only other bit that may accompany it
 * is the waiters bit.
 */
#define	ASSERT_CONSISTENT_STATE(readers)			\
	ASSERT(((readers) & URW_WRITE_LOCKED) == 0 ||		\
	    ((readers) & ~URW_HAS_WAITERS) == URW_WRITE_LOCKED)
44 
45 /*
46  * Find/allocate an entry for rwlp in our array of rwlocks held for reading.
47  * We must be deferring signals for this to be safe.
48  * Else if we are returning an entry with ul_rdlockcnt == 0,
49  * it could be reassigned behind our back in a signal handler.
50  */
51 static readlock_t *
52 rwl_entry(rwlock_t *rwlp)
53 {
54 	ulwp_t *self = curthread;
55 	readlock_t *remembered = NULL;
56 	readlock_t *readlockp;
57 	uint_t nlocks;
58 
59 	/* we must be deferring signals */
60 	ASSERT((self->ul_critical + self->ul_sigdefer) != 0);
61 
62 	if ((nlocks = self->ul_rdlockcnt) != 0)
63 		readlockp = self->ul_readlock.array;
64 	else {
65 		nlocks = 1;
66 		readlockp = &self->ul_readlock.single;
67 	}
68 
69 	for (; nlocks; nlocks--, readlockp++) {
70 		if (readlockp->rd_rwlock == rwlp)
71 			return (readlockp);
72 		if (readlockp->rd_count == 0 && remembered == NULL)
73 			remembered = readlockp;
74 	}
75 	if (remembered != NULL) {
76 		remembered->rd_rwlock = rwlp;
77 		return (remembered);
78 	}
79 
80 	/*
81 	 * No entry available.  Allocate more space, converting the single
82 	 * readlock_t entry into an array of readlock_t entries if necessary.
83 	 */
84 	if ((nlocks = self->ul_rdlockcnt) == 0) {
85 		/*
86 		 * Initial allocation of the readlock_t array.
87 		 * Convert the single entry into an array.
88 		 */
89 		self->ul_rdlockcnt = nlocks = NLOCKS;
90 		readlockp = lmalloc(nlocks * sizeof (readlock_t));
91 		/*
92 		 * The single readlock_t becomes the first entry in the array.
93 		 */
94 		*readlockp = self->ul_readlock.single;
95 		self->ul_readlock.single.rd_count = 0;
96 		self->ul_readlock.array = readlockp;
97 		/*
98 		 * Return the next available entry in the array.
99 		 */
100 		(++readlockp)->rd_rwlock = rwlp;
101 		return (readlockp);
102 	}
103 	/*
104 	 * Reallocate the array, double the size each time.
105 	 */
106 	readlockp = lmalloc(nlocks * 2 * sizeof (readlock_t));
107 	(void) memcpy(readlockp, self->ul_readlock.array,
108 	    nlocks * sizeof (readlock_t));
109 	lfree(self->ul_readlock.array, nlocks * sizeof (readlock_t));
110 	self->ul_readlock.array = readlockp;
111 	self->ul_rdlockcnt *= 2;
112 	/*
113 	 * Return the next available entry in the newly allocated array.
114 	 */
115 	(readlockp += nlocks)->rd_rwlock = rwlp;
116 	return (readlockp);
117 }
118 
119 /*
120  * Free the array of rwlocks held for reading.
121  */
122 void
123 rwl_free(ulwp_t *ulwp)
124 {
125 	uint_t nlocks;
126 
127 	if ((nlocks = ulwp->ul_rdlockcnt) != 0)
128 		lfree(ulwp->ul_readlock.array, nlocks * sizeof (readlock_t));
129 	ulwp->ul_rdlockcnt = 0;
130 	ulwp->ul_readlock.single.rd_rwlock = NULL;
131 	ulwp->ul_readlock.single.rd_count = 0;
132 }
133 
134 /*
135  * Check if a reader version of the lock is held by the current thread.
136  */
137 #pragma weak _rw_read_held = rw_read_held
138 int
139 rw_read_held(rwlock_t *rwlp)
140 {
141 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
142 	uint32_t readers;
143 	ulwp_t *self = curthread;
144 	readlock_t *readlockp;
145 	uint_t nlocks;
146 	int rval = 0;
147 
148 	no_preempt(self);
149 
150 	readers = *rwstate;
151 	ASSERT_CONSISTENT_STATE(readers);
152 	if (!(readers & URW_WRITE_LOCKED) &&
153 	    (readers & URW_READERS_MASK) != 0) {
154 		/*
155 		 * The lock is held for reading by some thread.
156 		 * Search our array of rwlocks held for reading for a match.
157 		 */
158 		if ((nlocks = self->ul_rdlockcnt) != 0)
159 			readlockp = self->ul_readlock.array;
160 		else {
161 			nlocks = 1;
162 			readlockp = &self->ul_readlock.single;
163 		}
164 		for (; nlocks; nlocks--, readlockp++) {
165 			if (readlockp->rd_rwlock == rwlp) {
166 				if (readlockp->rd_count)
167 					rval = 1;
168 				break;
169 			}
170 		}
171 	}
172 
173 	preempt(self);
174 	return (rval);
175 }
176 
177 /*
178  * Check if a writer version of the lock is held by the current thread.
179  */
180 #pragma weak _rw_write_held = rw_write_held
181 int
182 rw_write_held(rwlock_t *rwlp)
183 {
184 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
185 	uint32_t readers;
186 	ulwp_t *self = curthread;
187 	int rval;
188 
189 	no_preempt(self);
190 
191 	readers = *rwstate;
192 	ASSERT_CONSISTENT_STATE(readers);
193 	rval = ((readers & URW_WRITE_LOCKED) &&
194 	    rwlp->rwlock_owner == (uintptr_t)self &&
195 	    (rwlp->rwlock_type == USYNC_THREAD ||
196 	    rwlp->rwlock_ownerpid == self->ul_uberdata->pid));
197 
198 	preempt(self);
199 	return (rval);
200 }
201 
202 #pragma weak _rwlock_init = rwlock_init
203 /* ARGSUSED2 */
204 int
205 rwlock_init(rwlock_t *rwlp, int type, void *arg)
206 {
207 	ulwp_t *self = curthread;
208 
209 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
210 		return (EINVAL);
211 	/*
212 	 * Once reinitialized, we can no longer be holding a read or write lock.
213 	 * We can do nothing about other threads that are holding read locks.
214 	 */
215 	sigoff(self);
216 	rwl_entry(rwlp)->rd_count = 0;
217 	sigon(self);
218 	(void) memset(rwlp, 0, sizeof (*rwlp));
219 	rwlp->rwlock_type = (uint16_t)type;
220 	rwlp->rwlock_magic = RWL_MAGIC;
221 	rwlp->mutex.mutex_type = (uint8_t)type;
222 	rwlp->mutex.mutex_flag = LOCK_INITED;
223 	rwlp->mutex.mutex_magic = MUTEX_MAGIC;
224 
225 	/*
226 	 * This should be at the beginning of the function,
227 	 * but for the sake of old broken applications that
228 	 * do not have proper alignment for their rwlocks
229 	 * (and don't check the return code from rwlock_init),
230 	 * we put it here, after initializing the rwlock regardless.
231 	 */
232 	if (((uintptr_t)rwlp & (_LONG_LONG_ALIGNMENT - 1)) &&
233 	    self->ul_misaligned == 0)
234 		return (EINVAL);
235 
236 	return (0);
237 }
238 
239 #pragma weak pthread_rwlock_destroy = rwlock_destroy
240 #pragma weak _rwlock_destroy = rwlock_destroy
241 int
242 rwlock_destroy(rwlock_t *rwlp)
243 {
244 	/*
245 	 * Once destroyed, we can no longer be holding a read or write lock.
246 	 * We can do nothing about other threads that are holding read locks.
247 	 */
248 	sigoff(curthread);
249 	rwl_entry(rwlp)->rd_count = 0;
250 	sigon(curthread);
251 	rwlp->rwlock_magic = 0;
252 	tdb_sync_obj_deregister(rwlp);
253 	return (0);
254 }
255 
256 /*
257  * Attempt to acquire a readers lock.  Return true on success.
258  */
259 static int
260 read_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
261 {
262 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
263 	uint32_t mask = ignore_waiters_flag?
264 	    URW_WRITE_LOCKED : (URW_HAS_WAITERS | URW_WRITE_LOCKED);
265 	uint32_t readers;
266 	ulwp_t *self = curthread;
267 
268 	no_preempt(self);
269 	while (((readers = *rwstate) & mask) == 0) {
270 		if (atomic_cas_32(rwstate, readers, readers + 1) == readers) {
271 			preempt(self);
272 			return (1);
273 		}
274 	}
275 	preempt(self);
276 	return (0);
277 }
278 
279 /*
280  * Attempt to release a reader lock.  Return true on success.
281  */
282 static int
283 read_unlock_try(rwlock_t *rwlp)
284 {
285 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
286 	uint32_t readers;
287 	ulwp_t *self = curthread;
288 
289 	no_preempt(self);
290 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
291 		if (atomic_cas_32(rwstate, readers, readers - 1) == readers) {
292 			preempt(self);
293 			return (1);
294 		}
295 	}
296 	preempt(self);
297 	return (0);
298 }
299 
300 /*
301  * Attempt to acquire a writer lock.  Return true on success.
302  */
303 static int
304 write_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
305 {
306 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
307 	uint32_t mask = ignore_waiters_flag?
308 	    (URW_WRITE_LOCKED | URW_READERS_MASK) :
309 	    (URW_HAS_WAITERS | URW_WRITE_LOCKED | URW_READERS_MASK);
310 	ulwp_t *self = curthread;
311 	uint32_t readers;
312 
313 	no_preempt(self);
314 	while (((readers = *rwstate) & mask) == 0) {
315 		if (atomic_cas_32(rwstate, readers, readers | URW_WRITE_LOCKED)
316 		    == readers) {
317 			preempt(self);
318 			return (1);
319 		}
320 	}
321 	preempt(self);
322 	return (0);
323 }
324 
325 /*
326  * Attempt to release a writer lock.  Return true on success.
327  */
328 static int
329 write_unlock_try(rwlock_t *rwlp)
330 {
331 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
332 	uint32_t readers;
333 	ulwp_t *self = curthread;
334 
335 	no_preempt(self);
336 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
337 		if (atomic_cas_32(rwstate, readers, 0) == readers) {
338 			preempt(self);
339 			return (1);
340 		}
341 	}
342 	preempt(self);
343 	return (0);
344 }
345 
346 /*
347  * Wake up thread(s) sleeping on the rwlock queue and then
348  * drop the queue lock.  Return non-zero if we wake up someone.
349  * This is called when a thread releases a lock that appears to have waiters.
350  */
351 static int
352 rw_queue_release(queue_head_t *qp, rwlock_t *rwlp)
353 {
354 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
355 	uint32_t readers;
356 	uint32_t writers;
357 	ulwp_t **ulwpp;
358 	ulwp_t *ulwp;
359 	ulwp_t *prev;
360 	int nlwpid = 0;
361 	int more;
362 	int maxlwps = MAXLWPS;
363 	lwpid_t buffer[MAXLWPS];
364 	lwpid_t *lwpid = buffer;
365 
366 	readers = *rwstate;
367 	ASSERT_CONSISTENT_STATE(readers);
368 	if (!(readers & URW_HAS_WAITERS)) {
369 		queue_unlock(qp);
370 		return (0);
371 	}
372 	readers &= URW_READERS_MASK;
373 	writers = 0;
374 
375 	/*
376 	 * Examine the queue of waiters in priority order and prepare
377 	 * to wake up as many readers as we encounter before encountering
378 	 * a writer.  If the highest priority thread on the queue is a
379 	 * writer, stop there and wake it up.
380 	 *
381 	 * We keep track of lwpids that are to be unparked in lwpid[].
382 	 * __lwp_unpark_all() is called to unpark all of them after
383 	 * they have been removed from the sleep queue and the sleep
384 	 * queue lock has been dropped.  If we run out of space in our
385 	 * on-stack buffer, we need to allocate more but we can't call
386 	 * lmalloc() because we are holding a queue lock when the overflow
387 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
388 	 * either because the application may have allocated a small
389 	 * stack and we don't want to overrun the stack.  So we call
390 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
391 	 * system call directly since that path acquires no locks.
392 	 */
393 	while ((ulwpp = queue_slot(qp, &prev, &more)) != NULL) {
394 		ulwp = *ulwpp;
395 		ASSERT(ulwp->ul_wchan == rwlp);
396 		if (ulwp->ul_writer) {
397 			if (writers != 0 || readers != 0)
398 				break;
399 			/* one writer to wake */
400 			writers++;
401 		} else {
402 			if (writers != 0)
403 				break;
404 			/* at least one reader to wake */
405 			readers++;
406 			if (nlwpid == maxlwps)
407 				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
408 		}
409 		queue_unlink(qp, ulwpp, prev);
410 		ulwp->ul_sleepq = NULL;
411 		ulwp->ul_wchan = NULL;
412 		lwpid[nlwpid++] = ulwp->ul_lwpid;
413 	}
414 	if (ulwpp == NULL)
415 		atomic_and_32(rwstate, ~URW_HAS_WAITERS);
416 	if (nlwpid == 0) {
417 		queue_unlock(qp);
418 	} else {
419 		ulwp_t *self = curthread;
420 		no_preempt(self);
421 		queue_unlock(qp);
422 		if (nlwpid == 1)
423 			(void) __lwp_unpark(lwpid[0]);
424 		else
425 			(void) __lwp_unpark_all(lwpid, nlwpid);
426 		preempt(self);
427 	}
428 	if (lwpid != buffer)
429 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
430 	return (nlwpid != 0);
431 }
432 
433 /*
434  * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
435  * and trywrlock for process-shared (USYNC_PROCESS) rwlocks.
436  *
437  * Note: if the lock appears to be contended we call __lwp_rwlock_rdlock()
438  * or __lwp_rwlock_wrlock() holding the mutex. These return with the mutex
439  * released, and if they need to sleep will release the mutex first. In the
440  * event of a spurious wakeup, these will return EAGAIN (because it is much
441  * easier for us to re-acquire the mutex here).
442  */
443 int
444 shared_rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
445 {
446 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
447 	mutex_t *mp = &rwlp->mutex;
448 	uint32_t readers;
449 	int try_flag;
450 	int error;
451 
452 	try_flag = (rd_wr & TRY_FLAG);
453 	rd_wr &= ~TRY_FLAG;
454 	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);
455 
456 	if (!try_flag) {
457 		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
458 	}
459 
460 	do {
461 		if (try_flag && (*rwstate & URW_WRITE_LOCKED)) {
462 			error = EBUSY;
463 			break;
464 		}
465 		if ((error = mutex_lock(mp)) != 0)
466 			break;
467 		if (rd_wr == READ_LOCK) {
468 			if (read_lock_try(rwlp, 0)) {
469 				(void) mutex_unlock(mp);
470 				break;
471 			}
472 		} else {
473 			if (write_lock_try(rwlp, 0)) {
474 				(void) mutex_unlock(mp);
475 				break;
476 			}
477 		}
478 		atomic_or_32(rwstate, URW_HAS_WAITERS);
479 		readers = *rwstate;
480 		ASSERT_CONSISTENT_STATE(readers);
481 		/*
482 		 * The calls to __lwp_rwlock_*() below will release the mutex,
483 		 * so we need a dtrace probe here.
484 		 */
485 		mp->mutex_owner = 0;
486 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
487 		/*
488 		 * The waiters bit may be inaccurate.
489 		 * Only the kernel knows for sure.
490 		 */
491 		if (rd_wr == READ_LOCK) {
492 			if (try_flag)
493 				error = __lwp_rwlock_tryrdlock(rwlp);
494 			else
495 				error = __lwp_rwlock_rdlock(rwlp, tsp);
496 		} else {
497 			if (try_flag)
498 				error = __lwp_rwlock_trywrlock(rwlp);
499 			else
500 				error = __lwp_rwlock_wrlock(rwlp, tsp);
501 		}
502 	} while (error == EAGAIN || error == EINTR);
503 
504 	if (!try_flag) {
505 		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
506 	}
507 
508 	return (error);
509 }
510 
511 /*
512  * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
513  * and trywrlock for process-private (USYNC_THREAD) rwlocks.
514  */
515 int
516 rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
517 {
518 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
519 	uint32_t readers;
520 	ulwp_t *self = curthread;
521 	queue_head_t *qp;
522 	ulwp_t *ulwp;
523 	int try_flag;
524 	int ignore_waiters_flag;
525 	int error = 0;
526 
527 	try_flag = (rd_wr & TRY_FLAG);
528 	rd_wr &= ~TRY_FLAG;
529 	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);
530 
531 	if (!try_flag) {
532 		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
533 	}
534 
535 	qp = queue_lock(rwlp, MX);
536 	/* initial attempt to acquire the lock fails if there are waiters */
537 	ignore_waiters_flag = 0;
538 	while (error == 0) {
539 		if (rd_wr == READ_LOCK) {
540 			if (read_lock_try(rwlp, ignore_waiters_flag))
541 				break;
542 		} else {
543 			if (write_lock_try(rwlp, ignore_waiters_flag))
544 				break;
545 		}
546 		/* subsequent attempts do not fail due to waiters */
547 		ignore_waiters_flag = 1;
548 		atomic_or_32(rwstate, URW_HAS_WAITERS);
549 		readers = *rwstate;
550 		ASSERT_CONSISTENT_STATE(readers);
551 		if ((readers & URW_WRITE_LOCKED) ||
552 		    (rd_wr == WRITE_LOCK &&
553 		    (readers & URW_READERS_MASK) != 0))
554 			/* EMPTY */;	/* somebody holds the lock */
555 		else if ((ulwp = queue_waiter(qp)) == NULL) {
556 			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
557 			continue;	/* no queued waiters, try again */
558 		} else {
559 			/*
560 			 * Do a priority check on the queued waiter (the
561 			 * highest priority thread on the queue) to see
562 			 * if we should defer to him or just grab the lock.
563 			 */
564 			int our_pri = real_priority(self);
565 			int his_pri = real_priority(ulwp);
566 
567 			if (rd_wr == WRITE_LOCK) {
568 				/*
569 				 * We defer to a queued thread that has
570 				 * a higher priority than ours.
571 				 */
572 				if (his_pri <= our_pri)
573 					continue;	/* try again */
574 			} else {
575 				/*
576 				 * We defer to a queued thread that has
577 				 * a higher priority than ours or that
578 				 * is a writer whose priority equals ours.
579 				 */
580 				if (his_pri < our_pri ||
581 				    (his_pri == our_pri && !ulwp->ul_writer))
582 					continue;	/* try again */
583 			}
584 		}
585 		/*
586 		 * We are about to block.
587 		 * If we're doing a trylock, return EBUSY instead.
588 		 */
589 		if (try_flag) {
590 			error = EBUSY;
591 			break;
592 		}
593 		/*
594 		 * Enqueue writers ahead of readers.
595 		 */
596 		self->ul_writer = rd_wr;	/* *must* be 0 or 1 */
597 		enqueue(qp, self, 0);
598 		set_parking_flag(self, 1);
599 		queue_unlock(qp);
600 		if ((error = __lwp_park(tsp, 0)) == EINTR)
601 			error = ignore_waiters_flag = 0;
602 		set_parking_flag(self, 0);
603 		qp = queue_lock(rwlp, MX);
604 		if (self->ul_sleepq && dequeue_self(qp) == 0)
605 			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
606 		self->ul_writer = 0;
607 	}
608 
609 	queue_unlock(qp);
610 
611 	if (!try_flag) {
612 		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
613 	}
614 
615 	return (error);
616 }
617 
618 int
619 rw_rdlock_impl(rwlock_t *rwlp, timespec_t *tsp)
620 {
621 	ulwp_t *self = curthread;
622 	uberdata_t *udp = self->ul_uberdata;
623 	readlock_t *readlockp;
624 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
625 	int error;
626 
627 	/*
628 	 * If we already hold a readers lock on this rwlock,
629 	 * just increment our reference count and return.
630 	 */
631 	sigoff(self);
632 	readlockp = rwl_entry(rwlp);
633 	if (readlockp->rd_count != 0) {
634 		if (readlockp->rd_count == READ_LOCK_MAX) {
635 			sigon(self);
636 			error = EAGAIN;
637 			goto out;
638 		}
639 		sigon(self);
640 		error = 0;
641 		goto out;
642 	}
643 	sigon(self);
644 
645 	/*
646 	 * If we hold the writer lock, bail out.
647 	 */
648 	if (rw_write_held(rwlp)) {
649 		if (self->ul_error_detection)
650 			rwlock_error(rwlp, "rwlock_rdlock",
651 			    "calling thread owns the writer lock");
652 		error = EDEADLK;
653 		goto out;
654 	}
655 
656 	if (read_lock_try(rwlp, 0))
657 		error = 0;
658 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
659 		error = shared_rwlock_lock(rwlp, tsp, READ_LOCK);
660 	else						/* user-level */
661 		error = rwlock_lock(rwlp, tsp, READ_LOCK);
662 
663 out:
664 	if (error == 0) {
665 		sigoff(self);
666 		rwl_entry(rwlp)->rd_count++;
667 		sigon(self);
668 		if (rwsp)
669 			tdb_incr(rwsp->rw_rdlock);
670 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
671 	} else {
672 		DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK, error);
673 	}
674 
675 	return (error);
676 }
677 
678 #pragma weak pthread_rwlock_rdlock = rw_rdlock
679 #pragma weak _rw_rdlock = rw_rdlock
680 int
681 rw_rdlock(rwlock_t *rwlp)
682 {
683 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
684 	return (rw_rdlock_impl(rwlp, NULL));
685 }
686 
687 void
688 lrw_rdlock(rwlock_t *rwlp)
689 {
690 	enter_critical(curthread);
691 	(void) rw_rdlock_impl(rwlp, NULL);
692 }
693 
694 int
695 pthread_rwlock_reltimedrdlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
696     const struct timespec *_RESTRICT_KYWD reltime)
697 {
698 	timespec_t tslocal = *reltime;
699 	int error;
700 
701 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
702 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
703 	if (error == ETIME)
704 		error = ETIMEDOUT;
705 	return (error);
706 }
707 
708 int
709 pthread_rwlock_timedrdlock(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
710     const struct timespec *_RESTRICT_KYWD abstime)
711 {
712 	timespec_t tslocal;
713 	int error;
714 
715 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
716 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
717 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
718 	if (error == ETIME)
719 		error = ETIMEDOUT;
720 	return (error);
721 }
722 
723 int
724 rw_wrlock_impl(rwlock_t *rwlp, timespec_t *tsp)
725 {
726 	ulwp_t *self = curthread;
727 	uberdata_t *udp = self->ul_uberdata;
728 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
729 	int error;
730 
731 	/*
732 	 * If we hold a readers lock on this rwlock, bail out.
733 	 */
734 	if (rw_read_held(rwlp)) {
735 		if (self->ul_error_detection)
736 			rwlock_error(rwlp, "rwlock_wrlock",
737 			    "calling thread owns the readers lock");
738 		error = EDEADLK;
739 		goto out;
740 	}
741 
742 	/*
743 	 * If we hold the writer lock, bail out.
744 	 */
745 	if (rw_write_held(rwlp)) {
746 		if (self->ul_error_detection)
747 			rwlock_error(rwlp, "rwlock_wrlock",
748 			    "calling thread owns the writer lock");
749 		error = EDEADLK;
750 		goto out;
751 	}
752 
753 	if (write_lock_try(rwlp, 0))
754 		error = 0;
755 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
756 		error = shared_rwlock_lock(rwlp, tsp, WRITE_LOCK);
757 	else						/* user-level */
758 		error = rwlock_lock(rwlp, tsp, WRITE_LOCK);
759 
760 out:
761 	if (error == 0) {
762 		rwlp->rwlock_owner = (uintptr_t)self;
763 		if (rwlp->rwlock_type == USYNC_PROCESS)
764 			rwlp->rwlock_ownerpid = udp->pid;
765 		if (rwsp) {
766 			tdb_incr(rwsp->rw_wrlock);
767 			rwsp->rw_wrlock_begin_hold = gethrtime();
768 		}
769 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
770 	} else {
771 		DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK, error);
772 	}
773 	return (error);
774 }
775 
776 #pragma weak pthread_rwlock_wrlock = rw_wrlock
777 #pragma weak _rw_wrlock = rw_wrlock
778 int
779 rw_wrlock(rwlock_t *rwlp)
780 {
781 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
782 	return (rw_wrlock_impl(rwlp, NULL));
783 }
784 
785 void
786 lrw_wrlock(rwlock_t *rwlp)
787 {
788 	enter_critical(curthread);
789 	(void) rw_wrlock_impl(rwlp, NULL);
790 }
791 
792 int
793 pthread_rwlock_reltimedwrlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
794     const struct timespec *_RESTRICT_KYWD reltime)
795 {
796 	timespec_t tslocal = *reltime;
797 	int error;
798 
799 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
800 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
801 	if (error == ETIME)
802 		error = ETIMEDOUT;
803 	return (error);
804 }
805 
806 int
807 pthread_rwlock_timedwrlock(pthread_rwlock_t *rwlp, const timespec_t *abstime)
808 {
809 	timespec_t tslocal;
810 	int error;
811 
812 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
813 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
814 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
815 	if (error == ETIME)
816 		error = ETIMEDOUT;
817 	return (error);
818 }
819 
820 #pragma weak pthread_rwlock_tryrdlock = rw_tryrdlock
821 int
822 rw_tryrdlock(rwlock_t *rwlp)
823 {
824 	ulwp_t *self = curthread;
825 	uberdata_t *udp = self->ul_uberdata;
826 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
827 	readlock_t *readlockp;
828 	int error;
829 
830 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
831 
832 	if (rwsp)
833 		tdb_incr(rwsp->rw_rdlock_try);
834 
835 	/*
836 	 * If we already hold a readers lock on this rwlock,
837 	 * just increment our reference count and return.
838 	 */
839 	sigoff(self);
840 	readlockp = rwl_entry(rwlp);
841 	if (readlockp->rd_count != 0) {
842 		if (readlockp->rd_count == READ_LOCK_MAX) {
843 			sigon(self);
844 			error = EAGAIN;
845 			goto out;
846 		}
847 		sigon(self);
848 		error = 0;
849 		goto out;
850 	}
851 	sigon(self);
852 
853 	if (read_lock_try(rwlp, 0))
854 		error = 0;
855 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
856 		error = shared_rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
857 	else						/* user-level */
858 		error = rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
859 
860 out:
861 	if (error == 0) {
862 		sigoff(self);
863 		rwl_entry(rwlp)->rd_count++;
864 		sigon(self);
865 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
866 	} else {
867 		if (rwsp)
868 			tdb_incr(rwsp->rw_rdlock_try_fail);
869 		if (error != EBUSY) {
870 			DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK,
871 			    error);
872 		}
873 	}
874 
875 	return (error);
876 }
877 
878 #pragma weak pthread_rwlock_trywrlock = rw_trywrlock
879 int
880 rw_trywrlock(rwlock_t *rwlp)
881 {
882 	ulwp_t *self = curthread;
883 	uberdata_t *udp = self->ul_uberdata;
884 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
885 	int error;
886 
887 	ASSERT(!self->ul_critical || self->ul_bindflags);
888 
889 	if (rwsp)
890 		tdb_incr(rwsp->rw_wrlock_try);
891 
892 	if (write_lock_try(rwlp, 0))
893 		error = 0;
894 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
895 		error = shared_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
896 	else						/* user-level */
897 		error = rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
898 
899 	if (error == 0) {
900 		rwlp->rwlock_owner = (uintptr_t)self;
901 		if (rwlp->rwlock_type == USYNC_PROCESS)
902 			rwlp->rwlock_ownerpid = udp->pid;
903 		if (rwsp)
904 			rwsp->rw_wrlock_begin_hold = gethrtime();
905 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
906 	} else {
907 		if (rwsp)
908 			tdb_incr(rwsp->rw_wrlock_try_fail);
909 		if (error != EBUSY) {
910 			DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK,
911 			    error);
912 		}
913 	}
914 	return (error);
915 }
916 
917 #pragma weak pthread_rwlock_unlock = rw_unlock
918 #pragma weak _rw_unlock = rw_unlock
919 int
920 rw_unlock(rwlock_t *rwlp)
921 {
922 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
923 	uint32_t readers;
924 	ulwp_t *self = curthread;
925 	uberdata_t *udp = self->ul_uberdata;
926 	tdb_rwlock_stats_t *rwsp;
927 	queue_head_t *qp;
928 	int rd_wr;
929 	int waked = 0;
930 
931 	readers = *rwstate;
932 	ASSERT_CONSISTENT_STATE(readers);
933 	if (readers & URW_WRITE_LOCKED) {
934 		rd_wr = WRITE_LOCK;
935 		readers = 0;
936 	} else {
937 		rd_wr = READ_LOCK;
938 		readers &= URW_READERS_MASK;
939 	}
940 
941 	if (rd_wr == WRITE_LOCK) {
942 		/*
943 		 * Since the writer lock is held, we'd better be
944 		 * holding it, else we cannot legitimately be here.
945 		 */
946 		if (!rw_write_held(rwlp)) {
947 			if (self->ul_error_detection)
948 				rwlock_error(rwlp, "rwlock_unlock",
949 				    "writer lock held, "
950 				    "but not by the calling thread");
951 			return (EPERM);
952 		}
953 		if ((rwsp = RWLOCK_STATS(rwlp, udp)) != NULL) {
954 			if (rwsp->rw_wrlock_begin_hold)
955 				rwsp->rw_wrlock_hold_time +=
956 				    gethrtime() - rwsp->rw_wrlock_begin_hold;
957 			rwsp->rw_wrlock_begin_hold = 0;
958 		}
959 		rwlp->rwlock_owner = 0;
960 		rwlp->rwlock_ownerpid = 0;
961 	} else if (readers > 0) {
962 		/*
963 		 * A readers lock is held; if we don't hold one, bail out.
964 		 */
965 		readlock_t *readlockp;
966 
967 		sigoff(self);
968 		readlockp = rwl_entry(rwlp);
969 		if (readlockp->rd_count == 0) {
970 			sigon(self);
971 			if (self->ul_error_detection)
972 				rwlock_error(rwlp, "rwlock_unlock",
973 				    "readers lock held, "
974 				    "but not by the calling thread");
975 			return (EPERM);
976 		}
977 		/*
978 		 * If we hold more than one readers lock on this rwlock,
979 		 * just decrement our reference count and return.
980 		 */
981 		if (--readlockp->rd_count != 0) {
982 			sigon(self);
983 			goto out;
984 		}
985 		sigon(self);
986 	} else {
987 		/*
988 		 * This is a usage error.
989 		 * No thread should release an unowned lock.
990 		 */
991 		if (self->ul_error_detection)
992 			rwlock_error(rwlp, "rwlock_unlock", "lock not owned");
993 		return (EPERM);
994 	}
995 
996 	if (rd_wr == WRITE_LOCK && write_unlock_try(rwlp)) {
997 		/* EMPTY */;
998 	} else if (rd_wr == READ_LOCK && read_unlock_try(rwlp)) {
999 		/* EMPTY */;
1000 	} else if (rwlp->rwlock_type == USYNC_PROCESS) {
1001 		(void) mutex_lock(&rwlp->mutex);
1002 		(void) __lwp_rwlock_unlock(rwlp);
1003 		(void) mutex_unlock(&rwlp->mutex);
1004 		waked = 1;
1005 	} else {
1006 		qp = queue_lock(rwlp, MX);
1007 		if (rd_wr == READ_LOCK)
1008 			atomic_dec_32(rwstate);
1009 		else
1010 			atomic_and_32(rwstate, ~URW_WRITE_LOCKED);
1011 		waked = rw_queue_release(qp, rwlp);
1012 	}
1013 
1014 out:
1015 	DTRACE_PROBE2(plockstat, rw__release, rwlp, rd_wr);
1016 
1017 	/*
1018 	 * Yield to the thread we just waked up, just in case we might
1019 	 * be about to grab the rwlock again immediately upon return.
1020 	 * This is pretty weak but it helps on a uniprocessor and also
1021 	 * when cpu affinity has assigned both ourself and the other
1022 	 * thread to the same CPU.  Note that lwp_yield() will yield
1023 	 * the processor only if the writer is at the same or higher
1024 	 * priority than ourself.  This provides more balanced program
1025 	 * behavior; it doesn't guarantee acquisition of the lock by
1026 	 * the pending writer.
1027 	 */
1028 	if (waked)
1029 		yield();
1030 	return (0);
1031 }
1032 
1033 void
1034 lrw_unlock(rwlock_t *rwlp)
1035 {
1036 	(void) rw_unlock(rwlp);
1037 	exit_critical(curthread);
1038 }
1039