xref: /illumos-gate/usr/src/lib/libc/port/threads/rwlock.c (revision 3afe87ebb25691cb6d158edaa34a6fb9b703a691)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include "lint.h"
28 #include "thr_uberdata.h"
29 #include <sys/sdt.h>
30 
/*
 * Lock-kind codes passed as the rd_wr argument of rwlock_lock() and
 * shared_rwlock_lock().  TRY_FLAG is or'ed into the code to request the
 * non-blocking (try) variant; both functions strip it off again before
 * comparing what remains against READ_LOCK or WRITE_LOCK.
 *
 * The numeric values of READ_LOCK and WRITE_LOCK matter: rwlock_lock()
 * stores the code directly into ul_writer, which must be 0 or 1.
 */
#define	TRY_FLAG		0x10
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)

#define	NLOCKS	4	/* initial number of readlock_t structs allocated */

/*
 * Sanity check on a snapshot of the rwlock state word: if the
 * URW_WRITE_LOCKED bit is set, then no bits other than URW_WRITE_LOCKED
 * and URW_HAS_WAITERS may be set in the snapshot.
 */
#define	ASSERT_CONSISTENT_STATE(readers)		\
	ASSERT(!((readers) & URW_WRITE_LOCKED) ||	\
		((readers) & ~URW_HAS_WAITERS) == URW_WRITE_LOCKED)
42 
43 /*
44  * Find/allocate an entry for rwlp in our array of rwlocks held for reading.
45  * We must be deferring signals for this to be safe.
46  * Else if we are returning an entry with ul_rdlockcnt == 0,
47  * it could be reassigned behind our back in a signal handler.
48  */
49 static readlock_t *
50 rwl_entry(rwlock_t *rwlp)
51 {
52 	ulwp_t *self = curthread;
53 	readlock_t *remembered = NULL;
54 	readlock_t *readlockp;
55 	uint_t nlocks;
56 
57 	/* we must be deferring signals */
58 	ASSERT((self->ul_critical + self->ul_sigdefer) != 0);
59 
60 	if ((nlocks = self->ul_rdlockcnt) != 0)
61 		readlockp = self->ul_readlock.array;
62 	else {
63 		nlocks = 1;
64 		readlockp = &self->ul_readlock.single;
65 	}
66 
67 	for (; nlocks; nlocks--, readlockp++) {
68 		if (readlockp->rd_rwlock == rwlp)
69 			return (readlockp);
70 		if (readlockp->rd_count == 0 && remembered == NULL)
71 			remembered = readlockp;
72 	}
73 	if (remembered != NULL) {
74 		remembered->rd_rwlock = rwlp;
75 		return (remembered);
76 	}
77 
78 	/*
79 	 * No entry available.  Allocate more space, converting the single
80 	 * readlock_t entry into an array of readlock_t entries if necessary.
81 	 */
82 	if ((nlocks = self->ul_rdlockcnt) == 0) {
83 		/*
84 		 * Initial allocation of the readlock_t array.
85 		 * Convert the single entry into an array.
86 		 */
87 		self->ul_rdlockcnt = nlocks = NLOCKS;
88 		readlockp = lmalloc(nlocks * sizeof (readlock_t));
89 		/*
90 		 * The single readlock_t becomes the first entry in the array.
91 		 */
92 		*readlockp = self->ul_readlock.single;
93 		self->ul_readlock.single.rd_count = 0;
94 		self->ul_readlock.array = readlockp;
95 		/*
96 		 * Return the next available entry in the array.
97 		 */
98 		(++readlockp)->rd_rwlock = rwlp;
99 		return (readlockp);
100 	}
101 	/*
102 	 * Reallocate the array, double the size each time.
103 	 */
104 	readlockp = lmalloc(nlocks * 2 * sizeof (readlock_t));
105 	(void) memcpy(readlockp, self->ul_readlock.array,
106 	    nlocks * sizeof (readlock_t));
107 	lfree(self->ul_readlock.array, nlocks * sizeof (readlock_t));
108 	self->ul_readlock.array = readlockp;
109 	self->ul_rdlockcnt *= 2;
110 	/*
111 	 * Return the next available entry in the newly allocated array.
112 	 */
113 	(readlockp += nlocks)->rd_rwlock = rwlp;
114 	return (readlockp);
115 }
116 
117 /*
118  * Free the array of rwlocks held for reading.
119  */
120 void
121 rwl_free(ulwp_t *ulwp)
122 {
123 	uint_t nlocks;
124 
125 	if ((nlocks = ulwp->ul_rdlockcnt) != 0)
126 		lfree(ulwp->ul_readlock.array, nlocks * sizeof (readlock_t));
127 	ulwp->ul_rdlockcnt = 0;
128 	ulwp->ul_readlock.single.rd_rwlock = NULL;
129 	ulwp->ul_readlock.single.rd_count = 0;
130 }
131 
132 /*
133  * Check if a reader version of the lock is held by the current thread.
134  */
135 #pragma weak _rw_read_held = rw_read_held
136 int
137 rw_read_held(rwlock_t *rwlp)
138 {
139 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
140 	uint32_t readers;
141 	ulwp_t *self = curthread;
142 	readlock_t *readlockp;
143 	uint_t nlocks;
144 	int rval = 0;
145 
146 	no_preempt(self);
147 
148 	readers = *rwstate;
149 	ASSERT_CONSISTENT_STATE(readers);
150 	if (!(readers & URW_WRITE_LOCKED) &&
151 	    (readers & URW_READERS_MASK) != 0) {
152 		/*
153 		 * The lock is held for reading by some thread.
154 		 * Search our array of rwlocks held for reading for a match.
155 		 */
156 		if ((nlocks = self->ul_rdlockcnt) != 0)
157 			readlockp = self->ul_readlock.array;
158 		else {
159 			nlocks = 1;
160 			readlockp = &self->ul_readlock.single;
161 		}
162 		for (; nlocks; nlocks--, readlockp++) {
163 			if (readlockp->rd_rwlock == rwlp) {
164 				if (readlockp->rd_count)
165 					rval = 1;
166 				break;
167 			}
168 		}
169 	}
170 
171 	preempt(self);
172 	return (rval);
173 }
174 
175 /*
176  * Check if a writer version of the lock is held by the current thread.
177  */
178 #pragma weak _rw_write_held = rw_write_held
179 int
180 rw_write_held(rwlock_t *rwlp)
181 {
182 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
183 	uint32_t readers;
184 	ulwp_t *self = curthread;
185 	int rval;
186 
187 	no_preempt(self);
188 
189 	readers = *rwstate;
190 	ASSERT_CONSISTENT_STATE(readers);
191 	rval = ((readers & URW_WRITE_LOCKED) &&
192 	    rwlp->rwlock_owner == (uintptr_t)self &&
193 	    (rwlp->rwlock_type == USYNC_THREAD ||
194 	    rwlp->rwlock_ownerpid == self->ul_uberdata->pid));
195 
196 	preempt(self);
197 	return (rval);
198 }
199 
200 #pragma weak _rwlock_init = rwlock_init
201 /* ARGSUSED2 */
202 int
203 rwlock_init(rwlock_t *rwlp, int type, void *arg)
204 {
205 	ulwp_t *self = curthread;
206 
207 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
208 		return (EINVAL);
209 	/*
210 	 * Once reinitialized, we can no longer be holding a read or write lock.
211 	 * We can do nothing about other threads that are holding read locks.
212 	 */
213 	sigoff(self);
214 	rwl_entry(rwlp)->rd_count = 0;
215 	sigon(self);
216 	(void) memset(rwlp, 0, sizeof (*rwlp));
217 	rwlp->rwlock_type = (uint16_t)type;
218 	rwlp->rwlock_magic = RWL_MAGIC;
219 	rwlp->mutex.mutex_type = (uint8_t)type;
220 	rwlp->mutex.mutex_flag = LOCK_INITED;
221 	rwlp->mutex.mutex_magic = MUTEX_MAGIC;
222 
223 	/*
224 	 * This should be at the beginning of the function,
225 	 * but for the sake of old broken applications that
226 	 * do not have proper alignment for their rwlocks
227 	 * (and don't check the return code from rwlock_init),
228 	 * we put it here, after initializing the rwlock regardless.
229 	 */
230 	if (((uintptr_t)rwlp & (_LONG_LONG_ALIGNMENT - 1)) &&
231 	    self->ul_misaligned == 0)
232 		return (EINVAL);
233 
234 	return (0);
235 }
236 
237 #pragma weak pthread_rwlock_destroy = rwlock_destroy
238 #pragma weak _rwlock_destroy = rwlock_destroy
239 int
240 rwlock_destroy(rwlock_t *rwlp)
241 {
242 	/*
243 	 * Once destroyed, we can no longer be holding a read or write lock.
244 	 * We can do nothing about other threads that are holding read locks.
245 	 */
246 	sigoff(curthread);
247 	rwl_entry(rwlp)->rd_count = 0;
248 	sigon(curthread);
249 	rwlp->rwlock_magic = 0;
250 	tdb_sync_obj_deregister(rwlp);
251 	return (0);
252 }
253 
254 /*
255  * Attempt to acquire a readers lock.  Return true on success.
256  */
257 static int
258 read_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
259 {
260 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
261 	uint32_t mask = ignore_waiters_flag?
262 	    URW_WRITE_LOCKED : (URW_HAS_WAITERS | URW_WRITE_LOCKED);
263 	uint32_t readers;
264 	ulwp_t *self = curthread;
265 
266 	no_preempt(self);
267 	while (((readers = *rwstate) & mask) == 0) {
268 		if (atomic_cas_32(rwstate, readers, readers + 1) == readers) {
269 			preempt(self);
270 			return (1);
271 		}
272 	}
273 	preempt(self);
274 	return (0);
275 }
276 
277 /*
278  * Attempt to release a reader lock.  Return true on success.
279  */
280 static int
281 read_unlock_try(rwlock_t *rwlp)
282 {
283 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
284 	uint32_t readers;
285 	ulwp_t *self = curthread;
286 
287 	no_preempt(self);
288 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
289 		if (atomic_cas_32(rwstate, readers, readers - 1) == readers) {
290 			preempt(self);
291 			return (1);
292 		}
293 	}
294 	preempt(self);
295 	return (0);
296 }
297 
298 /*
299  * Attempt to acquire a writer lock.  Return true on success.
300  */
301 static int
302 write_lock_try(rwlock_t *rwlp, int ignore_waiters_flag)
303 {
304 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
305 	uint32_t mask = ignore_waiters_flag?
306 	    (URW_WRITE_LOCKED | URW_READERS_MASK) :
307 	    (URW_HAS_WAITERS | URW_WRITE_LOCKED | URW_READERS_MASK);
308 	ulwp_t *self = curthread;
309 	uint32_t readers;
310 
311 	no_preempt(self);
312 	while (((readers = *rwstate) & mask) == 0) {
313 		if (atomic_cas_32(rwstate, readers, readers | URW_WRITE_LOCKED)
314 		    == readers) {
315 			preempt(self);
316 			return (1);
317 		}
318 	}
319 	preempt(self);
320 	return (0);
321 }
322 
323 /*
324  * Attempt to release a writer lock.  Return true on success.
325  */
326 static int
327 write_unlock_try(rwlock_t *rwlp)
328 {
329 	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
330 	uint32_t readers;
331 	ulwp_t *self = curthread;
332 
333 	no_preempt(self);
334 	while (((readers = *rwstate) & URW_HAS_WAITERS) == 0) {
335 		if (atomic_cas_32(rwstate, readers, 0) == readers) {
336 			preempt(self);
337 			return (1);
338 		}
339 	}
340 	preempt(self);
341 	return (0);
342 }
343 
/*
 * Wake up thread(s) sleeping on the rwlock queue and then
 * drop the queue lock.  Return non-zero if we wake up someone.
 * This is called when a thread releases a lock that appears to have waiters.
 *
 * qp is the sleep queue for rwlp; the caller holds its lock on entry
 * and it is always unlocked by the time we return.
 *
 * If the state word no longer shows URW_HAS_WAITERS, nothing is done
 * beyond dropping the queue lock.
 */
static int
rw_queue_release(queue_head_t *qp, rwlock_t *rwlp)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	uint32_t readers;
	uint32_t writers;
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	int nlwpid = 0;
	int more;
	int maxlwps = MAXLWPS;
	lwpid_t buffer[MAXLWPS];
	lwpid_t *lwpid = buffer;

	readers = *rwstate;
	ASSERT_CONSISTENT_STATE(readers);
	if (!(readers & URW_HAS_WAITERS)) {
		queue_unlock(qp);
		return (0);
	}
	/*
	 * From here on, 'readers' starts out as the number of current
	 * read holds and is incremented for each reader we decide to
	 * wake; 'writers' counts the writers we decide to wake (0 or 1).
	 */
	readers &= URW_READERS_MASK;
	writers = 0;

	/*
	 * Examine the queue of waiters in priority order and prepare
	 * to wake up as many readers as we encounter before encountering
	 * a writer.  If the highest priority thread on the queue is a
	 * writer, stop there and wake it up.
	 *
	 * We keep track of lwpids that are to be unparked in lwpid[].
	 * __lwp_unpark_all() is called to unpark all of them after
	 * they have been removed from the sleep queue and the sleep
	 * queue lock has been dropped.  If we run out of space in our
	 * on-stack buffer, we need to allocate more but we can't call
	 * lmalloc() because we are holding a queue lock when the overflow
	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
	 * either because the application may have allocated a small
	 * stack and we don't want to overrun the stack.  So we call
	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
	 * system call directly since that path acquires no locks.
	 */
	while ((ulwpp = queue_slot(qp, &prev, &more)) != NULL) {
		ulwp = *ulwpp;
		ASSERT(ulwp->ul_wchan == rwlp);
		if (ulwp->ul_writer) {
			/*
			 * A writer can be woken only if the lock has no
			 * read holds and we have not selected anyone yet.
			 */
			if (writers != 0 || readers != 0)
				break;
			/* one writer to wake */
			writers++;
		} else {
			/* a reader cannot be woken alongside a writer */
			if (writers != 0)
				break;
			/* at least one reader to wake */
			readers++;
			if (nlwpid == maxlwps)
				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
		}
		queue_unlink(qp, ulwpp, prev);
		ulwp->ul_sleepq = NULL;
		ulwp->ul_wchan = NULL;
		lwpid[nlwpid++] = ulwp->ul_lwpid;
	}
	/* the loop ran off the end of the queue: nobody is left waiting */
	if (ulwpp == NULL)
		atomic_and_32(rwstate, ~URW_HAS_WAITERS);
	if (nlwpid == 0) {
		queue_unlock(qp);
	} else {
		ulwp_t *self = curthread;
		no_preempt(self);
		queue_unlock(qp);
		if (nlwpid == 1)
			(void) __lwp_unpark(lwpid[0]);
		else
			(void) __lwp_unpark_all(lwpid, nlwpid);
		preempt(self);
	}
	/* release the overflow buffer, if alloc_lwpids() gave us one */
	if (lwpid != buffer)
		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
	return (nlwpid != 0);
}
430 
/*
 * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
 * and trywrlock for process-shared (USYNC_PROCESS) rwlocks.
 *
 * Note: if the lock appears to be contended we call __lwp_rwlock_rdlock()
 * or __lwp_rwlock_wrlock() holding the mutex. These return with the mutex
 * released, and if they need to sleep will release the mutex first. In the
 * event of a spurious wakeup, these will return EAGAIN (because it is much
 * easier for us to re-acquire the mutex here).
 *
 * rwlp is the lock, tsp is the timeout handed through to the kernel
 * for the blocking variants, and rd_wr is one of READ_LOCK, WRITE_LOCK,
 * READ_LOCK_TRY or WRITE_LOCK_TRY.
 *
 * Returns 0 if the lock was acquired.  Otherwise returns EBUSY for a
 * try variant that finds the lock write-locked, the error from
 * mutex_lock(), or the error from the __lwp_rwlock_*() call.  The whole
 * sequence is retried while that call returns EAGAIN or EINTR.
 */
int
shared_rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	mutex_t *mp = &rwlp->mutex;
	uint32_t readers;
	int try_flag;
	int error;

	/* split rd_wr into the try bit and the plain lock kind */
	try_flag = (rd_wr & TRY_FLAG);
	rd_wr &= ~TRY_FLAG;
	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);

	if (!try_flag) {
		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
	}

	do {
		/* a trylock fails at once if a writer holds the lock */
		if (try_flag && (*rwstate & URW_WRITE_LOCKED)) {
			error = EBUSY;
			break;
		}
		if ((error = mutex_lock(mp)) != 0)
			break;
		/*
		 * With the mutex held, make one more user-level attempt
		 * that respects waiters.  On success error is still 0
		 * from the mutex_lock() above.
		 */
		if (rd_wr == READ_LOCK) {
			if (read_lock_try(rwlp, 0)) {
				(void) mutex_unlock(mp);
				break;
			}
		} else {
			if (write_lock_try(rwlp, 0)) {
				(void) mutex_unlock(mp);
				break;
			}
		}
		atomic_or_32(rwstate, URW_HAS_WAITERS);
		readers = *rwstate;
		ASSERT_CONSISTENT_STATE(readers);
		/*
		 * The calls to __lwp_rwlock_*() below will release the mutex,
		 * so we need a dtrace probe here.  The owner field of the
		 * mutex is cleared in the kernel when the mutex is released,
		 * so we should not clear it here.
		 */
		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
		/*
		 * The waiters bit may be inaccurate.
		 * Only the kernel knows for sure.
		 */
		if (rd_wr == READ_LOCK) {
			if (try_flag)
				error = __lwp_rwlock_tryrdlock(rwlp);
			else
				error = __lwp_rwlock_rdlock(rwlp, tsp);
		} else {
			if (try_flag)
				error = __lwp_rwlock_trywrlock(rwlp);
			else
				error = __lwp_rwlock_wrlock(rwlp, tsp);
		}
	} while (error == EAGAIN || error == EINTR);

	if (!try_flag) {
		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
	}

	return (error);
}
509 
/*
 * Common code for rdlock, timedrdlock, wrlock, timedwrlock, tryrdlock,
 * and trywrlock for process-private (USYNC_THREAD) rwlocks.
 *
 * rwlp is the lock, tsp is the timeout handed to __lwp_park() when we
 * have to sleep, and rd_wr is one of READ_LOCK, WRITE_LOCK,
 * READ_LOCK_TRY or WRITE_LOCK_TRY.
 *
 * Returns 0 if the lock was acquired, EBUSY if a try variant would
 * have had to block, or the error returned by __lwp_park() (EINTR
 * excepted, which causes a retry).
 */
int
rwlock_lock(rwlock_t *rwlp, timespec_t *tsp, int rd_wr)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	uint32_t readers;
	ulwp_t *self = curthread;
	queue_head_t *qp;
	ulwp_t *ulwp;
	int try_flag;
	int ignore_waiters_flag;
	int error = 0;

	/* split rd_wr into the try bit and the plain lock kind */
	try_flag = (rd_wr & TRY_FLAG);
	rd_wr &= ~TRY_FLAG;
	ASSERT(rd_wr == READ_LOCK || rd_wr == WRITE_LOCK);

	if (!try_flag) {
		DTRACE_PROBE2(plockstat, rw__block, rwlp, rd_wr);
	}

	/* the queue lock is held at the top of every loop iteration */
	qp = queue_lock(rwlp, MX);
	/* initial attempt to acquire the lock fails if there are waiters */
	ignore_waiters_flag = 0;
	while (error == 0) {
		if (rd_wr == READ_LOCK) {
			if (read_lock_try(rwlp, ignore_waiters_flag))
				break;
		} else {
			if (write_lock_try(rwlp, ignore_waiters_flag))
				break;
		}
		/* subsequent attempts do not fail due to waiters */
		ignore_waiters_flag = 1;
		atomic_or_32(rwstate, URW_HAS_WAITERS);
		readers = *rwstate;
		ASSERT_CONSISTENT_STATE(readers);
		if ((readers & URW_WRITE_LOCKED) ||
		    (rd_wr == WRITE_LOCK &&
		    (readers & URW_READERS_MASK) != 0))
			/* EMPTY */;	/* somebody holds the lock */
		else if ((ulwp = queue_waiter(qp)) == NULL) {
			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
			continue;	/* no queued waiters, try again */
		} else {
			/*
			 * Do a priority check on the queued waiter (the
			 * highest priority thread on the queue) to see
			 * if we should defer to him or just grab the lock.
			 */
			int our_pri = real_priority(self);
			int his_pri = real_priority(ulwp);

			if (rd_wr == WRITE_LOCK) {
				/*
				 * We defer to a queued thread that has
				 * a higher priority than ours.
				 */
				if (his_pri <= our_pri)
					continue;	/* try again */
			} else {
				/*
				 * We defer to a queued thread that has
				 * a higher priority than ours or that
				 * is a writer whose priority equals ours.
				 */
				if (his_pri < our_pri ||
				    (his_pri == our_pri && !ulwp->ul_writer))
					continue;	/* try again */
			}
		}
		/*
		 * We are about to block.
		 * If we're doing a trylock, return EBUSY instead.
		 */
		if (try_flag) {
			error = EBUSY;
			break;
		}
		/*
		 * Enqueue writers ahead of readers.
		 */
		self->ul_writer = rd_wr;	/* *must* be 0 or 1 */
		enqueue(qp, self, 0);
		set_parking_flag(self, 1);
		queue_unlock(qp);
		/*
		 * Sleep.  An EINTR return is not an error: clear it so
		 * that the loop runs again, and go back to honoring the
		 * waiters bit on the next attempt.
		 */
		if ((error = __lwp_park(tsp, 0)) == EINTR)
			error = ignore_waiters_flag = 0;
		set_parking_flag(self, 0);
		qp = queue_lock(rwlp, MX);
		/*
		 * If we are still on the sleep queue, take ourself off.
		 * A zero return from dequeue_self() is taken to mean that
		 * no waiters remain, so the waiters bit is cleared.
		 */
		if (self->ul_sleepq && dequeue_self(qp) == 0)
			atomic_and_32(rwstate, ~URW_HAS_WAITERS);
		self->ul_writer = 0;
	}

	queue_unlock(qp);

	if (!try_flag) {
		DTRACE_PROBE3(plockstat, rw__blocked, rwlp, rd_wr, error == 0);
	}

	return (error);
}
616 
617 int
618 rw_rdlock_impl(rwlock_t *rwlp, timespec_t *tsp)
619 {
620 	ulwp_t *self = curthread;
621 	uberdata_t *udp = self->ul_uberdata;
622 	readlock_t *readlockp;
623 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
624 	int error;
625 
626 	/*
627 	 * If we already hold a readers lock on this rwlock,
628 	 * just increment our reference count and return.
629 	 */
630 	sigoff(self);
631 	readlockp = rwl_entry(rwlp);
632 	if (readlockp->rd_count != 0) {
633 		if (readlockp->rd_count == READ_LOCK_MAX) {
634 			sigon(self);
635 			error = EAGAIN;
636 			goto out;
637 		}
638 		sigon(self);
639 		error = 0;
640 		goto out;
641 	}
642 	sigon(self);
643 
644 	/*
645 	 * If we hold the writer lock, bail out.
646 	 */
647 	if (rw_write_held(rwlp)) {
648 		if (self->ul_error_detection)
649 			rwlock_error(rwlp, "rwlock_rdlock",
650 			    "calling thread owns the writer lock");
651 		error = EDEADLK;
652 		goto out;
653 	}
654 
655 	if (read_lock_try(rwlp, 0))
656 		error = 0;
657 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
658 		error = shared_rwlock_lock(rwlp, tsp, READ_LOCK);
659 	else						/* user-level */
660 		error = rwlock_lock(rwlp, tsp, READ_LOCK);
661 
662 out:
663 	if (error == 0) {
664 		sigoff(self);
665 		rwl_entry(rwlp)->rd_count++;
666 		sigon(self);
667 		if (rwsp)
668 			tdb_incr(rwsp->rw_rdlock);
669 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
670 	} else {
671 		DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK, error);
672 	}
673 
674 	return (error);
675 }
676 
677 #pragma weak pthread_rwlock_rdlock = rw_rdlock
678 #pragma weak _rw_rdlock = rw_rdlock
679 int
680 rw_rdlock(rwlock_t *rwlp)
681 {
682 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
683 	return (rw_rdlock_impl(rwlp, NULL));
684 }
685 
686 void
687 lrw_rdlock(rwlock_t *rwlp)
688 {
689 	enter_critical(curthread);
690 	(void) rw_rdlock_impl(rwlp, NULL);
691 }
692 
693 int
694 pthread_rwlock_reltimedrdlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
695     const struct timespec *_RESTRICT_KYWD reltime)
696 {
697 	timespec_t tslocal = *reltime;
698 	int error;
699 
700 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
701 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
702 	if (error == ETIME)
703 		error = ETIMEDOUT;
704 	return (error);
705 }
706 
707 int
708 pthread_rwlock_timedrdlock(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
709     const struct timespec *_RESTRICT_KYWD abstime)
710 {
711 	timespec_t tslocal;
712 	int error;
713 
714 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
715 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
716 	error = rw_rdlock_impl((rwlock_t *)rwlp, &tslocal);
717 	if (error == ETIME)
718 		error = ETIMEDOUT;
719 	return (error);
720 }
721 
722 int
723 rw_wrlock_impl(rwlock_t *rwlp, timespec_t *tsp)
724 {
725 	ulwp_t *self = curthread;
726 	uberdata_t *udp = self->ul_uberdata;
727 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
728 	int error;
729 
730 	/*
731 	 * If we hold a readers lock on this rwlock, bail out.
732 	 */
733 	if (rw_read_held(rwlp)) {
734 		if (self->ul_error_detection)
735 			rwlock_error(rwlp, "rwlock_wrlock",
736 			    "calling thread owns the readers lock");
737 		error = EDEADLK;
738 		goto out;
739 	}
740 
741 	/*
742 	 * If we hold the writer lock, bail out.
743 	 */
744 	if (rw_write_held(rwlp)) {
745 		if (self->ul_error_detection)
746 			rwlock_error(rwlp, "rwlock_wrlock",
747 			    "calling thread owns the writer lock");
748 		error = EDEADLK;
749 		goto out;
750 	}
751 
752 	if (write_lock_try(rwlp, 0))
753 		error = 0;
754 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
755 		error = shared_rwlock_lock(rwlp, tsp, WRITE_LOCK);
756 	else						/* user-level */
757 		error = rwlock_lock(rwlp, tsp, WRITE_LOCK);
758 
759 out:
760 	if (error == 0) {
761 		rwlp->rwlock_owner = (uintptr_t)self;
762 		if (rwlp->rwlock_type == USYNC_PROCESS)
763 			rwlp->rwlock_ownerpid = udp->pid;
764 		if (rwsp) {
765 			tdb_incr(rwsp->rw_wrlock);
766 			rwsp->rw_wrlock_begin_hold = gethrtime();
767 		}
768 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
769 	} else {
770 		DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK, error);
771 	}
772 	return (error);
773 }
774 
775 #pragma weak pthread_rwlock_wrlock = rw_wrlock
776 #pragma weak _rw_wrlock = rw_wrlock
777 int
778 rw_wrlock(rwlock_t *rwlp)
779 {
780 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
781 	return (rw_wrlock_impl(rwlp, NULL));
782 }
783 
784 void
785 lrw_wrlock(rwlock_t *rwlp)
786 {
787 	enter_critical(curthread);
788 	(void) rw_wrlock_impl(rwlp, NULL);
789 }
790 
791 int
792 pthread_rwlock_reltimedwrlock_np(pthread_rwlock_t *_RESTRICT_KYWD rwlp,
793     const struct timespec *_RESTRICT_KYWD reltime)
794 {
795 	timespec_t tslocal = *reltime;
796 	int error;
797 
798 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
799 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
800 	if (error == ETIME)
801 		error = ETIMEDOUT;
802 	return (error);
803 }
804 
805 int
806 pthread_rwlock_timedwrlock(pthread_rwlock_t *rwlp, const timespec_t *abstime)
807 {
808 	timespec_t tslocal;
809 	int error;
810 
811 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
812 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
813 	error = rw_wrlock_impl((rwlock_t *)rwlp, &tslocal);
814 	if (error == ETIME)
815 		error = ETIMEDOUT;
816 	return (error);
817 }
818 
819 #pragma weak pthread_rwlock_tryrdlock = rw_tryrdlock
820 int
821 rw_tryrdlock(rwlock_t *rwlp)
822 {
823 	ulwp_t *self = curthread;
824 	uberdata_t *udp = self->ul_uberdata;
825 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
826 	readlock_t *readlockp;
827 	int error;
828 
829 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
830 
831 	if (rwsp)
832 		tdb_incr(rwsp->rw_rdlock_try);
833 
834 	/*
835 	 * If we already hold a readers lock on this rwlock,
836 	 * just increment our reference count and return.
837 	 */
838 	sigoff(self);
839 	readlockp = rwl_entry(rwlp);
840 	if (readlockp->rd_count != 0) {
841 		if (readlockp->rd_count == READ_LOCK_MAX) {
842 			sigon(self);
843 			error = EAGAIN;
844 			goto out;
845 		}
846 		sigon(self);
847 		error = 0;
848 		goto out;
849 	}
850 	sigon(self);
851 
852 	if (read_lock_try(rwlp, 0))
853 		error = 0;
854 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
855 		error = shared_rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
856 	else						/* user-level */
857 		error = rwlock_lock(rwlp, NULL, READ_LOCK_TRY);
858 
859 out:
860 	if (error == 0) {
861 		sigoff(self);
862 		rwl_entry(rwlp)->rd_count++;
863 		sigon(self);
864 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, READ_LOCK);
865 	} else {
866 		if (rwsp)
867 			tdb_incr(rwsp->rw_rdlock_try_fail);
868 		if (error != EBUSY) {
869 			DTRACE_PROBE3(plockstat, rw__error, rwlp, READ_LOCK,
870 			    error);
871 		}
872 	}
873 
874 	return (error);
875 }
876 
877 #pragma weak pthread_rwlock_trywrlock = rw_trywrlock
878 int
879 rw_trywrlock(rwlock_t *rwlp)
880 {
881 	ulwp_t *self = curthread;
882 	uberdata_t *udp = self->ul_uberdata;
883 	tdb_rwlock_stats_t *rwsp = RWLOCK_STATS(rwlp, udp);
884 	int error;
885 
886 	ASSERT(!self->ul_critical || self->ul_bindflags);
887 
888 	if (rwsp)
889 		tdb_incr(rwsp->rw_wrlock_try);
890 
891 	if (write_lock_try(rwlp, 0))
892 		error = 0;
893 	else if (rwlp->rwlock_type == USYNC_PROCESS)	/* kernel-level */
894 		error = shared_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
895 	else						/* user-level */
896 		error = rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY);
897 
898 	if (error == 0) {
899 		rwlp->rwlock_owner = (uintptr_t)self;
900 		if (rwlp->rwlock_type == USYNC_PROCESS)
901 			rwlp->rwlock_ownerpid = udp->pid;
902 		if (rwsp)
903 			rwsp->rw_wrlock_begin_hold = gethrtime();
904 		DTRACE_PROBE2(plockstat, rw__acquire, rwlp, WRITE_LOCK);
905 	} else {
906 		if (rwsp)
907 			tdb_incr(rwsp->rw_wrlock_try_fail);
908 		if (error != EBUSY) {
909 			DTRACE_PROBE3(plockstat, rw__error, rwlp, WRITE_LOCK,
910 			    error);
911 		}
912 	}
913 	return (error);
914 }
915 
/*
 * Release one hold on rwlp, whether it is held for reading or writing.
 *
 * Whether this is a write or a read release is decided from a snapshot
 * of the lock's state word, not from anything the caller passes in.
 *
 * Returns 0 on success.  Returns EPERM if the lock is write-locked but
 * not by the calling thread, if it is read-locked but the calling
 * thread has no read hold on it, or if it is not locked at all.
 *
 * A recursive read hold only decrements the calling thread's private
 * count; the lock itself is touched when that count reaches zero.
 */
#pragma weak pthread_rwlock_unlock = rw_unlock
#pragma weak _rw_unlock = rw_unlock
int
rw_unlock(rwlock_t *rwlp)
{
	volatile uint32_t *rwstate = (volatile uint32_t *)&rwlp->rwlock_readers;
	uint32_t readers;
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	tdb_rwlock_stats_t *rwsp;
	queue_head_t *qp;
	int rd_wr;
	int waked = 0;

	/* classify the release from the current state of the lock */
	readers = *rwstate;
	ASSERT_CONSISTENT_STATE(readers);
	if (readers & URW_WRITE_LOCKED) {
		rd_wr = WRITE_LOCK;
		readers = 0;
	} else {
		rd_wr = READ_LOCK;
		readers &= URW_READERS_MASK;
	}

	if (rd_wr == WRITE_LOCK) {
		/*
		 * Since the writer lock is held, we'd better be
		 * holding it, else we cannot legitimately be here.
		 */
		if (!rw_write_held(rwlp)) {
			if (self->ul_error_detection)
				rwlock_error(rwlp, "rwlock_unlock",
				    "writer lock held, "
				    "but not by the calling thread");
			return (EPERM);
		}
		/* account for the time the writer lock was held */
		if ((rwsp = RWLOCK_STATS(rwlp, udp)) != NULL) {
			if (rwsp->rw_wrlock_begin_hold)
				rwsp->rw_wrlock_hold_time +=
				    gethrtime() - rwsp->rw_wrlock_begin_hold;
			rwsp->rw_wrlock_begin_hold = 0;
		}
		rwlp->rwlock_owner = 0;
		rwlp->rwlock_ownerpid = 0;
	} else if (readers > 0) {
		/*
		 * A readers lock is held; if we don't hold one, bail out.
		 */
		readlock_t *readlockp;

		sigoff(self);
		readlockp = rwl_entry(rwlp);
		if (readlockp->rd_count == 0) {
			sigon(self);
			if (self->ul_error_detection)
				rwlock_error(rwlp, "rwlock_unlock",
				    "readers lock held, "
				    "but not by the calling thread");
			return (EPERM);
		}
		/*
		 * If we hold more than one readers lock on this rwlock,
		 * just decrement our reference count and return.
		 */
		if (--readlockp->rd_count != 0) {
			sigon(self);
			goto out;
		}
		sigon(self);
	} else {
		/*
		 * This is a usage error.
		 * No thread should release an unowned lock.
		 */
		if (self->ul_error_detection)
			rwlock_error(rwlp, "rwlock_unlock", "lock not owned");
		return (EPERM);
	}

	/*
	 * Release the lock itself.  The *_unlock_try() routines succeed
	 * only when the waiters bit is clear; otherwise the release goes
	 * through the kernel (USYNC_PROCESS) or through the user-level
	 * sleep queue.
	 */
	if (rd_wr == WRITE_LOCK && write_unlock_try(rwlp)) {
		/* EMPTY */;
	} else if (rd_wr == READ_LOCK && read_unlock_try(rwlp)) {
		/* EMPTY */;
	} else if (rwlp->rwlock_type == USYNC_PROCESS) {
		(void) mutex_lock(&rwlp->mutex);
		(void) __lwp_rwlock_unlock(rwlp);
		(void) mutex_unlock(&rwlp->mutex);
		/* we cannot tell here whether anyone was woken; assume so */
		waked = 1;
	} else {
		qp = queue_lock(rwlp, MX);
		if (rd_wr == READ_LOCK)
			atomic_dec_32(rwstate);
		else
			atomic_and_32(rwstate, ~URW_WRITE_LOCKED);
		/* rw_queue_release() drops the queue lock for us */
		waked = rw_queue_release(qp, rwlp);
	}

out:
	DTRACE_PROBE2(plockstat, rw__release, rwlp, rd_wr);

	/*
	 * Yield to the thread we just waked up, just in case we might
	 * be about to grab the rwlock again immediately upon return.
	 * This is pretty weak but it helps on a uniprocessor and also
	 * when cpu affinity has assigned both ourself and the other
	 * thread to the same CPU.  Note that lwp_yield() will yield
	 * the processor only if the writer is at the same or higher
	 * priority than ourself.  This provides more balanced program
	 * behavior; it doesn't guarantee acquisition of the lock by
	 * the pending writer.
	 */
	if (waked)
		yield();
	return (0);
}
1031 
1032 void
1033 lrw_unlock(rwlock_t *rwlp)
1034 {
1035 	(void) rw_unlock(rwlp);
1036 	exit_critical(curthread);
1037 }
1038