xref: /illumos-gate/usr/src/lib/libc/port/threads/rwlock.c (revision 5bbb4db2c3f208d12bf0fd11769728f9e5ba66a2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include "lint.h"
28 #include "thr_uberdata.h"
29 #include <sys/sdt.h>
30 
/*
 * Lock-request codes passed as the rd_wr argument of rwlock_lock() and
 * shared_rwlock_lock().  TRY_FLAG is OR-ed onto READ_LOCK or WRITE_LOCK
 * to request a non-blocking attempt; both functions strip it off again
 * with (rd_wr & ~TRY_FLAG) before comparing against READ_LOCK/WRITE_LOCK.
 */
#define	TRY_FLAG		0x10
#define	READ_LOCK		0
#define	WRITE_LOCK		1	/* also stored directly in ul_writer */
#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)

#define	NLOCKS	4	/* initial number of readlock_t structs allocated */

/*
 * Sanity check on a snapshot of the rwlock state word: if the
 * URW_WRITE_LOCKED bit is set then, apart from URW_HAS_WAITERS,
 * no other bit (in particular no reader count) may be set.
 */
#define	ASSERT_CONSISTENT_STATE(readers)		\
	ASSERT(!((readers) & URW_WRITE_LOCKED) ||	\
		((readers) & ~URW_HAS_WAITERS) == URW_WRITE_LOCKED)
42 
43 /*
44  * Find/allocate an entry for rwlp in our array of rwlocks held for reading.
45  * We must be deferring signals for this to be safe.
46  * Else if we are returning an entry with ul_rdlockcnt == 0,
47  * it could be reassigned behind our back in a signal handler.
48  */
49 static readlock_t *
50 rwl_entry(rwlock_t *rwlp)
51 {
52 	ulwp_t *self = curthread;
53 	readlock_t *remembered = NULL;
54 	readlock_t *readlockp;
55 	uint_t nlocks;
56 
57 	/* we must be deferring signals */
58 	ASSERT((self->ul_critical + self->ul_sigdefer) != 0);
59 
60 	if ((nlocks = self->ul_rdlockcnt) != 0)
61 		readlockp = self->ul_readlock.array;
62 	else {
63 		nlocks = 1;
64 		readlockp = &self->ul_readlock.single;
65 	}
66 
67 	for (; nlocks; nlocks--, readlockp++) {
68 		if (readlockp->rd_rwlock == rwlp)
69 			return (readlockp);
70 		if (readlockp->rd_count == 0 && remembered == NULL)
71 			remembered = readlockp;
72 	}
73 	if (remembered != NULL) {
74 		remembered->rd_rwlock = rwlp;
75 		return (remembered);
76 	}
77 
78 	/*
79 	 * No entry available.  Allocate more space, converting the single
80 	 * readlock_t entry into an array of readlock_t entries if necessary.
81 	 */
82 	if ((nlocks = self->ul_rdlockcnt) == 0) {
83 		/*
84 		 * Initial allocation of the readlock_t array.
85 		 * Convert the single entry into an array.
86 		 */
87 		self->ul_rdlockcnt = nlocks = NLOCKS;
88 		readlockp = lmalloc(nlocks * sizeof (readlock_t));
89 		/*
90 		 * The single readlock_t becomes the first entry in the array.
91 		 */
92 		*readlockp = self->ul_readlock.single;
93 		self->ul_readlock.single.rd_count = 0;
94 		self->ul_readlock.array = readlockp;
95 		/*
96 		 * Return the next available entry in the array.
97 		 */
98 		(++readlockp)->rd_rwlock = rwlp;
99 		return (readlockp);
100 	}
101 	/*
102 	 * Reallocate the array, double the size each time.
103 	 */
104 	readlockp = lmalloc(nlocks * 2 * sizeof (readlock_t));
105 	(void) memcpy(readlockp, self->ul_readlock.array,
106 	    nlocks * sizeof (readlock_t));
107 	lfree(self->ul_readlock.array, nlocks * sizeof (readlock_t));
108 	self->ul_readlock.array = readlockp;
109 	self->ul_rdlockcnt *= 2;
110 	/*
111 	 * Return the next available entry in the newly allocated array.
112 	 */
113 	(readlockp += nlocks)->rd_rwlock = rwlp;
114 	return (readlockp);
115 }
116 
117 /*
118  * Free the array of rwlocks held for reading.
119  */
120 void
121 rwl_free(ulwp_t *ulwp)
122 {
123 	uint_t nlocks;
124 
125 	if ((nlocks = ulwp->ul_rdlockcnt) != 0)
126 		lfree(ulwp->ul_readlock.array, nlocks * sizeof (readlock_t));
127 	ulwp->ul_rdlockcnt = 0;
128 	ulwp->ul_readlock.single.rd_rwlock = NULL;
129 	ulwp->ul_readlock.single.rd_count = 0;
130 }
131 
132 /*
133  * Check if a reader version of the lock is held by the current thread.
134  */
135 #pragma weak _rw_read_held = rw_read_held
136 int
137 rw_read_held(rwlock_t *rwlp)
138 {
139 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
140 	uint32_t readers;
141 	ulwp_t *self = curthread;
142 	readlock_t *readlockp;
143 	uint_t nlocks;
144 	int rval = 0;
145 
146 	no_preempt(self);
147 
148 	readers = *rwstate;
149 	ASSERT_CONSISTENT_STATE(readers);
150 	if (!(readers & URW_WRITE_LOCKED) &&
151 	    (readers & URW_READERS_MASK) != 0) {
152 		/*
153 		 * The lock is held for reading by some thread.
154 		 * Search our array of rwlocks held for reading for a match.
155 		 */
156 		if ((nlocks = self->ul_rdlockcnt) != 0)
157 			readlockp = self->ul_readlock.array;
158 		else {
159 			nlocks = 1;
160 			readlockp = &self->ul_readlock.single;
161 		}
162 		for (; nlocks; nlocks--, readlockp++) {
163 			if (readlockp->rd_rwlock == rwlp) {
164 				if (readlockp->rd_count)
165 					rval = 1;
166 				break;
167 			}
168 		}
169 	}
170 
171 	preempt(self);
172 	return (rval);
173 }
174 
175 /*
176  * Check if a writer version of the lock is held by the current thread.
177  */
178 #pragma weak _rw_write_held = rw_write_held
179 int
180 rw_write_held(rwlock_t *rwlp)
181 {
182 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
183 	uint32_t readers;
184 	ulwp_t *self = curthread;
185 	int rval;
186 
187 	no_preempt(self);
188 
189 	readers = *rwstate;
190 	ASSERT_CONSISTENT_STATE(readers);
191 	rval = ((readers & URW_WRITE_LOCKED) &&
192 	    rwlp->rwlock_owner == (uintptr_t)self &&
193 	    (rwlp->rwlock_type == USYNC_THREAD ||
194 	    rwlp->rwlock_ownerpid == self->ul_uberdata->pid));
195 
196 	preempt(self);
197 	return (rval);
198 }
199 
200 #pragma weak _rwlock_init = rwlock_init
201 /* ARGSUSED2 */
202 int
203 rwlock_init(rwlock_t *rwlp, int type, void *arg)
204 {
205 	ulwp_t *self = curthread;
206 
207 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
208 		return (EINVAL);
209 	/*
210 	 * Once reinitialized, we can no longer be holding a read or write lock.
211 	 * We can do nothing about other threads that are holding read locks.
212 	 */
213 	sigoff(self);
214 	rwl_entry(rwlp)->rd_count = 0;
215 	sigon(self);
216 	(void) memset(rwlp, 0, sizeof (*rwlp));
217 	rwlp->rwlock_type = (uint16_t)type;
218 	rwlp->rwlock_magic = RWL_MAGIC;
219 	rwlp->mutex.mutex_type = (uint8_t)type;
220 	rwlp->mutex.mutex_flag = LOCK_INITED;
221 	rwlp->mutex.mutex_magic = MUTEX_MAGIC;
222 
223 	/*
224 	 * This should be at the beginning of the function,
225 	 * but for the sake of old broken applications that
226 	 * do not have proper alignment for their rwlocks
227 	 * (and don't check the return code from rwlock_init),
228 	 * we put it here, after initializing the rwlock regardless.
229 	 */
230 	if (((uintptr_t)rwlp & (_LONG_LONG_ALIGNMENT - 1)) &&
231 	    self->ul_misaligned == 0)
232 		return (EINVAL);
233 
234 	return (0);
235 }
236 
237 #pragma weak pthread_rwlock_destroy = rwlock_destroy
238 #pragma weak _rwlock_destroy = rwlock_destroy
239 int
240 rwlock_destroy(rwlock_t *rwlp)
241 {
242 	ulwp_t *self = curthread;
243 
244 	/*
245 	 * Once destroyed, we can no longer be holding a read or write lock.
246 	 * We can do nothing about other threads that are holding read locks.
247 	 */
248 	sigoff(self);
249 	rwl_entry(rwlp)->rd_count = 0;
250 	sigon(self);
251 	rwlp->rwlock_magic = 0;
252 	tdb_sync_obj_deregister(rwlp);
253 	return (0);
254 }
255 
256 /*
257  * Attempt to acquire a readers lock.  Return true on success.
258  */
259 static int
260 read_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
261 {
262 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
263 	uint32_t mask = ignore_waiters_flag?
264 	    URW_WRITE_LOCKED : (URW_HAS_WAITERS | URW_WRITE_LOCKED);
265 	uint32_t readers;
266 	ulwp_t *self = curthread;
267 
268 	no_preempt(self);
269 	while (((readers = *rwstate) & mask) == 0) {
270 		if (atomic_cas_32(rwstate, readers, readers + 1) == readers) {
271 			preempt(self);
272 			return (1);
273 		}
274 	}
275 	preempt(self);
276 	return (0);
277 }
278 
279 /*
280  * Attempt to release a reader lock.  Return true on success.
281  */
282 static int
283 read_unlock_try(rwlock_t *rwlp)
284 {
285 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
286 	uint32_t readers;
287 	ulwp_t *self = curthread;
288 
289 	no_preempt(self);
290 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
291 		if (atomic_cas_32(rwstate, readers, readers - 1) == readers) {
292 			preempt(self);
293 			return (1);
294 		}
295 	}
296 	preempt(self);
297 	return (0);
298 }
299 
300 /*
301  * Attempt to acquire a writer lock.  Return true on success.
302  */
303 static int
304 write_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
305 {
306 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
307 	uint32_t mask = ignore_waiters_flag?
308 	    (URW_WRITE_LOCKED | URW_READERS_MASK) :
309 	    (URW_HAS_WAITERS | URW_WRITE_LOCKED | URW_READERS_MASK);
310 	ulwp_t *self = curthread;
311 	uint32_t readers;
312 
313 	no_preempt(self);
314 	while (((readers = *rwstate) & mask) == 0) {
315 		if (atomic_cas_32(rwstate, readers, readers | URW_WRITE_LOCKED)
316 		    == readers) {
317 			preempt(self);
318 			return (1);
319 		}
320 	}
321 	preempt(self);
322 	return (0);
323 }
324 
325 /*
326  * Attempt to release a writer lock.  Return true on success.
327  */
328 static int
329 write_unlock_try(rwlock_t *rwlp)
330 {
331 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
332 	uint32_t readers;
333 	ulwp_t *self = curthread;
334 
335 	no_preempt(self);
336 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
337 		if (atomic_cas_32(rwstate, readers, 0) == readers) {
338 			preempt(self);
339 			return (1);
340 		}
341 	}
342 	preempt(self);
343 	return (0);
344 }
345 
/*
 * Wake up thread(s) sleeping on the rwlock queue and then
 * drop the queue lock.  Return non-zero if we wake up someone.
 * This is called when a thread releases a lock that appears to have waiters.
 *
 * qp is the sleep queue for rwlp; every path through this function
 * calls queue_unlock(qp) exactly once, so the caller must arrive
 * holding that queue lock and must not touch it afterwards.
 *
 * If the state word does not have URW_HAS_WAITERS set, nothing is
 * woken and 0 is returned.
 */
static int
rw_queue_release(queue_head_t *qp, rwlock_t *rwlp)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	uint32_t readers;
	uint32_t writers;
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	int nlwpid = 0;
	int more;	/* filled in by queue_slot(); not examined here */
	int maxlwps = MAXLWPS;
	lwpid_t buffer[MAXLWPS];
	lwpid_t *lwpid = buffer;

	readers = *rwstate;
	ASSERT_CONSISTENT_STATE(readers);
	if (!(readers & URW_HAS_WAITERS)) {
		queue_unlock(qp);
		return (0);
	}
	/*
	 * Start the reader tally at the number of readers that still
	 * hold the lock, so that a queued writer is not selected while
	 * any reader remains (see the readers != 0 test below).
	 */
	readers &= URW_READERS_MASK;
	writers = 0;

	/*
	 * Examine the queue of waiters in priority order and prepare
	 * to wake up as many readers as we encounter before encountering
	 * a writer.  If the highest priority thread on the queue is a
	 * writer, stop there and wake it up.
	 *
	 * We keep track of lwpids that are to be unparked in lwpid[].
	 * __lwp_unpark_all() is called to unpark all of them after
	 * they have been removed from the sleep queue and the sleep
	 * queue lock has been dropped.  If we run out of space in our
	 * on-stack buffer, we need to allocate more but we can't call
	 * lmalloc() because we are holding a queue lock when the overflow
	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
	 * either because the application may have allocated a small
	 * stack and we don't want to overrun the stack.  So we call
	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
	 * system call directly since that path acquires no locks.
	 */
	while ((ulwpp = queue_slot(qp, &prev, &more)) != NULL) {
		ulwp = *ulwpp;
		ASSERT(ulwp->ul_wchan == rwlp);
		if (ulwp->ul_writer) {
			if (writers != 0 || readers != 0)
				break;
			/* one writer to wake */
			writers++;
		} else {
			if (writers != 0)
				break;
			/* at least one reader to wake */
			readers++;
			if (nlwpid == maxlwps)
				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
		}
		/* take the chosen thread off the sleep queue and note its id */
		queue_unlink(qp, ulwpp, prev);
		ulwp->ul_sleepq = NULL;
		ulwp->ul_wchan = NULL;
		lwpid[nlwpid++] = ulwp->ul_lwpid;
	}
	/*
	 * ulwpp is NULL only if the loop ended because queue_slot() found
	 * no further waiter (rather than via break), so the waiters bit
	 * can be cleared.
	 */
	if (ulwpp == NULL)
		atomic_and_32(rwstate, ~URW_HAS_WAITERS);
	if (nlwpid == 0) {
		queue_unlock(qp);
	} else {
		ulwp_t *self = curthread;
		/*
		 * Preemption is disabled across dropping the queue lock and
		 * issuing the unpark call(s) -- presumably so that the
		 * selected threads are not left parked while we are
		 * descheduled (NOTE(review): rationale not shown here).
		 */
		no_preempt(self);
		queue_unlock(qp);
		if (nlwpid == 1)
			(void) __lwp_unpark(lwpid[0]);
		else
			(void) __lwp_unpark_all(lwpid, nlwpid);
		preempt(self);
	}
	/* lwpid differs from buffer only if alloc_lwpids() replaced it */
	if (lwpid != buffer)
		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
	return (nlwpid != 0);
}
432 
/*
 * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
 * and trywrlock for process-shared (USYNC_PROCESS) rwlocks.
 *
 * Note: if the lock appears to be contended we call __lwp_rwlock_rdlock()
 * or __lwp_rwlock_wrlock() holding the mutex. These return with the mutex
 * released, and if they need to sleep will release the mutex first. In the
 * event of a spurious wakeup, these will return EAGAIN (because it is much
 * easier for us to re-acquire the mutex here).
 *
 * Arguments:
 *	rwlp	the rwlock to acquire
 *	tsp	timeout handed unchanged to __lwp_rwlock_rdlock() or
 *		__lwp_rwlock_wrlock(); not used for try requests
 *	rd_wr	READ_LOCK or WRITE_LOCK, optionally OR-ed with TRY_FLAG
 *
 * Returns 0 once the lock has been acquired.  Otherwise returns EBUSY
 * for a try request that finds the lock write-locked, the error from
 * mutex_lock(), or the error from the __lwp_rwlock_*() call.  EAGAIN
 * and EINTR from those calls are never returned: they restart the loop.
 */
int
shared_rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	mutex_t *mp = &rwlp->mutex;
	uint32_t readers;
	int try_flag;
	int error;

	/* split the request into its try bit and its lock mode */
	try_flag = (rd_wr & TRY_FLAG);
	rd_wr &= ~TRY_FLAG;
	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);

	if (!try_flag) {
		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
	}

	do {
		/* a try request gives up at once if a writer holds the lock */
		if (try_flag && (*rwstate & URW_WRITE_LOCKED)) {
			error = EBUSY;
			break;
		}
		if ((error = mutex_lock(mp)) != 0)
			break;
		/*
		 * With the mutex held, make a user-level attempt that
		 * respects waiters.  On success error is still 0 from
		 * mutex_lock() above.
		 */
		if (rd_wr == READ_LOCK) {
			if (read_lock_try(rwlp, 0)) {
				(void) mutex_unlock(mp);
				break;
			}
		} else {
			if (write_lock_try(rwlp, 0)) {
				(void) mutex_unlock(mp);
				break;
			}
		}
		/* advertise that we are about to ask the kernel for the lock */
		atomic_or_32(rwstate, URW_HAS_WAITERS);
		readers = *rwstate;
		ASSERT_CONSISTENT_STATE(readers);
		/*
		 * The calls to __lwp_rwlock_*() below will release the mutex,
		 * so we need a dtrace probe here.  The owner field of the
		 * mutex is cleared in the kernel when the mutex is released,
		 * so we should not clear it here.
		 */
		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
		/*
		 * The waiters bit may be inaccurate.
		 * Only the kernel knows for sure.
		 */
		if (rd_wr == READ_LOCK) {
			if (try_flag)
				error = __lwp_rwlock_tryrdlock(rwlp);
			else
				error = __lwp_rwlock_rdlock(rwlp, tsp);
		} else {
			if (try_flag)
				error = __lwp_rwlock_trywrlock(rwlp);
			else
				error = __lwp_rwlock_wrlock(rwlp, tsp);
		}
	} while (error == EAGAIN || error == EINTR);

	if (!try_flag) {
		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
	}

	return (error);
}
511 
/*
 * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
 * and trywrlock for process-private (USYNC_THREAD) rwlocks.
 *
 * Arguments:
 *	rwlp	the rwlock to acquire
 *	tsp	timeout handed unchanged to __lwp_park()
 *	rd_wr	READ_LOCK or WRITE_LOCK, optionally OR-ed with TRY_FLAG
 *
 * The whole loop runs with the rwlock's sleep queue locked, except
 * while the thread is parked.  Each iteration tries to take the lock;
 * failing that it either retries, returns EBUSY (try requests), or
 * enqueues the calling thread and parks.
 *
 * Returns 0 once the lock has been acquired, EBUSY for a try request
 * that would have had to block, or a non-zero error from __lwp_park()
 * other than EINTR (EINTR restarts the loop).
 */
int
rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	uint32_t readers;
	ulwp_t *self = curthread;
	queue_head_t *qp;
	ulwp_t *ulwp;
	int try_flag;
	int ignore_waiters_flag;
	int error = 0;

	/* split the request into its try bit and its lock mode */
	try_flag = (rd_wr & TRY_FLAG);
	rd_wr &= ~TRY_FLAG;
	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);

	if (!try_flag) {
		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
	}

	qp = queue_lock(rwlp, MX);
	/* initial attempt to acquire the lock fails if there are waiters */
	ignore_waiters_flag = 0;
	while (error == 0) {
		if (rd_wr == READ_LOCK) {
			if (read_lock_try(rwlp, ignore_waiters_flag))
				break;
		} else {
			if (write_lock_try(rwlp, ignore_waiters_flag))
				break;
		}
		/* subsequent attempts do not fail due to waiters */
		ignore_waiters_flag = 1;
		atomic_or_32(rwstate, URW_HAS_WAITERS);
		readers = *rwstate;
		ASSERT_CONSISTENT_STATE(readers);
		/*
		 * Decide whether to retry at once ("continue") or to fall
		 * through to the blocking code below.
		 */
		if ((readers & URW_WRITE_LOCKED) ||
		    (rd_wr == WRITE_LOCK &&
		    (readers & URW_READERS_MASK) != 0))
			/* EMPTY */;	/* somebody holds the lock */
		else if ((ulwp = queue_waiter(qp)) == NULL) {
			/* we set the waiters bit above; nobody is queued */
			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
			continue;	/* no queued waiters, try again */
		} else {
			/*
			 * Do a priority check on the queued waiter (the
			 * highest priority thread on the queue) to see
			 * if we should defer to him or just grab the lock.
			 */
			int our_pri = real_priority(self);
			int his_pri = real_priority(ulwp);

			if (rd_wr == WRITE_LOCK) {
				/*
				 * We defer to a queued thread that has
				 * a higher priority than ours.
				 */
				if (his_pri <= our_pri)
					continue;	/* try again */
			} else {
				/*
				 * We defer to a queued thread that has
				 * a higher priority than ours or that
				 * is a writer whose priority equals ours.
				 */
				if (his_pri < our_pri ||
				    (his_pri == our_pri && !ulwp->ul_writer))
					continue;	/* try again */
			}
		}
		/*
		 * We are about to block.
		 * If we're doing a trylock, return EBUSY instead.
		 */
		if (try_flag) {
			error = EBUSY;
			break;
		}
		/*
		 * Enqueue writers ahead of readers.
		 */
		self->ul_writer = rd_wr;	/* *must* be 0 or 1 */
		enqueue(qp, self, 0);
		set_parking_flag(self, 1);
		queue_unlock(qp);
		/*
		 * An interrupted park is not an error: clear it so that the
		 * loop runs again, and go back to respecting waiters on the
		 * next attempt.  Any other non-zero result ends the loop
		 * after the cleanup below.
		 */
		if ((error = __lwp_park(tsp, 0)) == EINTR)
			error = ignore_waiters_flag = 0;
		set_parking_flag(self, 0);
		qp = queue_lock(rwlp, MX);
		/*
		 * If we are still on the sleep queue (ul_sleepq is not NULL)
		 * we take ourselves off it.  The waiters bit is cleared when
		 * dequeue_self() returns 0 -- presumably meaning no other
		 * waiters remain (NOTE(review): confirm against
		 * dequeue_self()).
		 */
		if (self->ul_sleepq && dequeue_self(qp) == 0)
			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
		self->ul_writer = 0;
	}

	queue_unlock(qp);

	if (!try_flag) {
		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
	}

	return (error);
}
618 
619 int
620 rw_rdlock_impl(rwlock_t *rwlp, timespec_t *tsp)
621 {
622 	ulwp_t *self = curthread;
623 	uberdata_t *udp = self->ul_uberdata;
624 	readlock_t *readlockp;
625 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
626 	int error;
627 
628 	/*
629 	 * If we already hold a readers lock on this rwlock,
630 	 * just increment our reference count and return.
631 	 */
632 	sigoff(self);
633 	readlockp = rwl_entry(rwlp);
634 	if (readlockp->rd_count != 0) {
635 		if (readlockp->rd_count == READ_LOCK_MAX) {
636 			sigon(self);
637 			error = EAGAIN;
638 			goto out;
639 		}
640 		sigon(self);
641 		error = 0;
642 		goto out;
643 	}
644 	sigon(self);
645 
646 	/*
647 	 * If we hold the writer lock, bail out.
648 	 */
649 	if (rw_write_held(rwlp)) {
650 		if (self->ul_error_detection)
651 			rwlock_error(rwlp, "rwlock_rdlock",
652 			    "calling thread owns the writer lock");
653 		error = EDEADLK;
654 		goto out;
655 	}
656 
657 	if (read_lock_try(rwlp, 0))
658 		error = 0;
659 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
660 		error = shared_rwlock_lock(rwlp, tsp, READ_LOCK);
661 	else						/* user-level */
662 		error = rwlock_lock(rwlp, tsp, READ_LOCK);
663 
664 out:
665 	if (error == 0) {
666 		sigoff(self);
667 		rwl_entry(rwlp)->rd_count++;
668 		sigon(self);
669 		if (rwsp)
670 			tdb_incr(rwsp->rw_rdlock);
671 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
672 	} else {
673 		DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK, error);
674 	}
675 
676 	return (error);
677 }
678 
679 #pragma weak pthread_rwlock_rdlock = rw_rdlock
680 #pragma weak _rw_rdlock = rw_rdlock
681 int
682 rw_rdlock(rwlock_t *rwlp)
683 {
684 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
685 	return (rw_rdlock_impl(rwlp, NULL));
686 }
687 
688 void
689 lrw_rdlock(rwlock_t *rwlp)
690 {
691 	enter_critical(curthread);
692 	(void) rw_rdlock_impl(rwlp, NULL);
693 }
694 
695 int
696 pthread_rwlock_reltimedrdlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
697     const struct timespec *_RESTRICT_KYWD reltime)
698 {
699 	timespec_t tslocal = *reltime;
700 	int error;
701 
702 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
703 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
704 	if (error == ETIME)
705 		error = ETIMEDOUT;
706 	return (error);
707 }
708 
709 int
710 pthread_rwlock_timedrdlock(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
711     const struct timespec *_RESTRICT_KYWD abstime)
712 {
713 	timespec_t tslocal;
714 	int error;
715 
716 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
717 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
718 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
719 	if (error == ETIME)
720 		error = ETIMEDOUT;
721 	return (error);
722 }
723 
724 int
725 rw_wrlock_impl(rwlock_t *rwlp, timespec_t *tsp)
726 {
727 	ulwp_t *self = curthread;
728 	uberdata_t *udp = self->ul_uberdata;
729 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
730 	int error;
731 
732 	/*
733 	 * If we hold a readers lock on this rwlock, bail out.
734 	 */
735 	if (rw_read_held(rwlp)) {
736 		if (self->ul_error_detection)
737 			rwlock_error(rwlp, "rwlock_wrlock",
738 			    "calling thread owns the readers lock");
739 		error = EDEADLK;
740 		goto out;
741 	}
742 
743 	/*
744 	 * If we hold the writer lock, bail out.
745 	 */
746 	if (rw_write_held(rwlp)) {
747 		if (self->ul_error_detection)
748 			rwlock_error(rwlp, "rwlock_wrlock",
749 			    "calling thread owns the writer lock");
750 		error = EDEADLK;
751 		goto out;
752 	}
753 
754 	if (write_lock_try(rwlp, 0))
755 		error = 0;
756 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
757 		error = shared_rwlock_lock(rwlp, tsp, WRITE_LOCK);
758 	else						/* user-level */
759 		error = rwlock_lock(rwlp, tsp, WRITE_LOCK);
760 
761 out:
762 	if (error == 0) {
763 		rwlp->rwlock_owner = (uintptr_t)self;
764 		if (rwlp->rwlock_type == USYNC_PROCESS)
765 			rwlp->rwlock_ownerpid = udp->pid;
766 		if (rwsp) {
767 			tdb_incr(rwsp->rw_wrlock);
768 			rwsp->rw_wrlock_begin_hold = gethrtime();
769 		}
770 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
771 	} else {
772 		DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK, error);
773 	}
774 	return (error);
775 }
776 
777 #pragma weak pthread_rwlock_wrlock = rw_wrlock
778 #pragma weak _rw_wrlock = rw_wrlock
779 int
780 rw_wrlock(rwlock_t *rwlp)
781 {
782 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
783 	return (rw_wrlock_impl(rwlp, NULL));
784 }
785 
786 void
787 lrw_wrlock(rwlock_t *rwlp)
788 {
789 	enter_critical(curthread);
790 	(void) rw_wrlock_impl(rwlp, NULL);
791 }
792 
793 int
794 pthread_rwlock_reltimedwrlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
795     const struct timespec *_RESTRICT_KYWD reltime)
796 {
797 	timespec_t tslocal = *reltime;
798 	int error;
799 
800 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
801 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
802 	if (error == ETIME)
803 		error = ETIMEDOUT;
804 	return (error);
805 }
806 
807 int
808 pthread_rwlock_timedwrlock(pthread_rwlock_t *rwlp, const timespec_t *abstime)
809 {
810 	timespec_t tslocal;
811 	int error;
812 
813 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
814 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
815 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
816 	if (error == ETIME)
817 		error = ETIMEDOUT;
818 	return (error);
819 }
820 
821 #pragma weak pthread_rwlock_tryrdlock = rw_tryrdlock
822 int
823 rw_tryrdlock(rwlock_t *rwlp)
824 {
825 	ulwp_t *self = curthread;
826 	uberdata_t *udp = self->ul_uberdata;
827 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
828 	readlock_t *readlockp;
829 	int error;
830 
831 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
832 
833 	if (rwsp)
834 		tdb_incr(rwsp->rw_rdlock_try);
835 
836 	/*
837 	 * If we already hold a readers lock on this rwlock,
838 	 * just increment our reference count and return.
839 	 */
840 	sigoff(self);
841 	readlockp = rwl_entry(rwlp);
842 	if (readlockp->rd_count != 0) {
843 		if (readlockp->rd_count == READ_LOCK_MAX) {
844 			sigon(self);
845 			error = EAGAIN;
846 			goto out;
847 		}
848 		sigon(self);
849 		error = 0;
850 		goto out;
851 	}
852 	sigon(self);
853 
854 	if (read_lock_try(rwlp, 0))
855 		error = 0;
856 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
857 		error = shared_rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
858 	else						/* user-level */
859 		error = rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
860 
861 out:
862 	if (error == 0) {
863 		sigoff(self);
864 		rwl_entry(rwlp)->rd_count++;
865 		sigon(self);
866 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
867 	} else {
868 		if (rwsp)
869 			tdb_incr(rwsp->rw_rdlock_try_fail);
870 		if (error != EBUSY) {
871 			DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK,
872 			    error);
873 		}
874 	}
875 
876 	return (error);
877 }
878 
879 #pragma weak pthread_rwlock_trywrlock = rw_trywrlock
880 int
881 rw_trywrlock(rwlock_t *rwlp)
882 {
883 	ulwp_t *self = curthread;
884 	uberdata_t *udp = self->ul_uberdata;
885 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
886 	int error;
887 
888 	ASSERT(!self->ul_critical || self->ul_bindflags);
889 
890 	if (rwsp)
891 		tdb_incr(rwsp->rw_wrlock_try);
892 
893 	if (write_lock_try(rwlp, 0))
894 		error = 0;
895 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
896 		error = shared_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
897 	else						/* user-level */
898 		error = rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
899 
900 	if (error == 0) {
901 		rwlp->rwlock_owner = (uintptr_t)self;
902 		if (rwlp->rwlock_type == USYNC_PROCESS)
903 			rwlp->rwlock_ownerpid = udp->pid;
904 		if (rwsp)
905 			rwsp->rw_wrlock_begin_hold = gethrtime();
906 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
907 	} else {
908 		if (rwsp)
909 			tdb_incr(rwsp->rw_wrlock_try_fail);
910 		if (error != EBUSY) {
911 			DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK,
912 			    error);
913 		}
914 	}
915 	return (error);
916 }
917 
918 #pragma weak pthread_rwlock_unlock = rw_unlock
919 #pragma weak _rw_unlock = rw_unlock
920 int
921 rw_unlock(rwlock_t *rwlp)
922 {
923 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
924 	uint32_t readers;
925 	ulwp_t *self = curthread;
926 	uberdata_t *udp = self->ul_uberdata;
927 	tdb_rwlock_stats_t *rwsp;
928 	queue_head_t *qp;
929 	int rd_wr;
930 	int waked = 0;
931 
932 	readers = *rwstate;
933 	ASSERT_CONSISTENT_STATE(readers);
934 	if (readers & URW_WRITE_LOCKED) {
935 		rd_wr = WRITE_LOCK;
936 		readers = 0;
937 	} else {
938 		rd_wr = READ_LOCK;
939 		readers &= URW_READERS_MASK;
940 	}
941 
942 	if (rd_wr == WRITE_LOCK) {
943 		/*
944 		 * Since the writer lock is held, we'd better be
945 		 * holding it, else we cannot legitimately be here.
946 		 */
947 		if (!rw_write_held(rwlp)) {
948 			if (self->ul_error_detection)
949 				rwlock_error(rwlp, "rwlock_unlock",
950 				    "writer lock held, "
951 				    "but not by the calling thread");
952 			return (EPERM);
953 		}
954 		if ((rwsp = RWLOCK_STATS(rwlp, udp)) != NULL) {
955 			if (rwsp->rw_wrlock_begin_hold)
956 				rwsp->rw_wrlock_hold_time +=
957 				    gethrtime() - rwsp->rw_wrlock_begin_hold;
958 			rwsp->rw_wrlock_begin_hold = 0;
959 		}
960 		rwlp->rwlock_owner = 0;
961 		rwlp->rwlock_ownerpid = 0;
962 	} else if (readers > 0) {
963 		/*
964 		 * A readers lock is held; if we don't hold one, bail out.
965 		 */
966 		readlock_t *readlockp;
967 
968 		sigoff(self);
969 		readlockp = rwl_entry(rwlp);
970 		if (readlockp->rd_count == 0) {
971 			sigon(self);
972 			if (self->ul_error_detection)
973 				rwlock_error(rwlp, "rwlock_unlock",
974 				    "readers lock held, "
975 				    "but not by the calling thread");
976 			return (EPERM);
977 		}
978 		/*
979 		 * If we hold more than one readers lock on this rwlock,
980 		 * just decrement our reference count and return.
981 		 */
982 		if (--readlockp->rd_count != 0) {
983 			sigon(self);
984 			goto out;
985 		}
986 		sigon(self);
987 	} else {
988 		/*
989 		 * This is a usage error.
990 		 * No thread should release an unowned lock.
991 		 */
992 		if (self->ul_error_detection)
993 			rwlock_error(rwlp, "rwlock_unlock", "lock not owned");
994 		return (EPERM);
995 	}
996 
997 	if (rd_wr == WRITE_LOCK && write_unlock_try(rwlp)) {
998 		/* EMPTY */;
999 	} else if (rd_wr == READ_LOCK && read_unlock_try(rwlp)) {
1000 		/* EMPTY */;
1001 	} else if (rwlp->rwlock_type == USYNC_PROCESS) {
1002 		(void) mutex_lock(&rwlp->mutex);
1003 		(void) __lwp_rwlock_unlock(rwlp);
1004 		(void) mutex_unlock(&rwlp->mutex);
1005 		waked = 1;
1006 	} else {
1007 		qp = queue_lock(rwlp, MX);
1008 		if (rd_wr == READ_LOCK)
1009 			atomic_dec_32(rwstate);
1010 		else
1011 			atomic_and_32(rwstate, ~URW_WRITE_LOCKED);
1012 		waked = rw_queue_release(qp, rwlp);
1013 	}
1014 
1015 out:
1016 	DTRACE_PROBE2(plockstat, rw__release, rwlp, rd_wr);
1017 
1018 	/*
1019 	 * Yield to the thread we just waked up, just in case we might
1020 	 * be about to grab the rwlock again immediately upon return.
1021 	 * This is pretty weak but it helps on a uniprocessor and also
1022 	 * when cpu affinity has assigned both ourself and the other
1023 	 * thread to the same CPU.  Note that lwp_yield() will yield
1024 	 * the processor only if the writer is at the same or higher
1025 	 * priority than ourself.  This provides more balanced program
1026 	 * behavior; it doesn't guarantee acquisition of the lock by
1027 	 * the pending writer.
1028 	 */
1029 	if (waked)
1030 		yield();
1031 	return (0);
1032 }
1033 
1034 void
1035 lrw_unlock(rwlock_t *rwlp)
1036 {
1037 	(void) rw_unlock(rwlp);
1038 	exit_critical(curthread);
1039 }
1040