/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/turnstile.h>
#include <sys/rwlock.h>
#include <sys/rwlock_impl.h>
#include <sys/atomic.h>
#include <sys/lockstat.h>

/*
 * Big Theory Statement for readers/writer locking primitives.
 *
 * An rwlock provides exclusive access to a single thread ("writer") or
 * concurrent access to multiple threads ("readers").  See rwlock(9F)
 * for a full description of the interfaces and programming model.
 * The rest of this comment describes the implementation.
 *
 * An rwlock is a single word with the following structure:
 *
 *	---------------------------------------------------------------------
 *	| OWNER (writer) or HOLD COUNT (readers)   | WRLOCK | WRWANT | WAIT |
 *	---------------------------------------------------------------------
 *			63 / 31 .. 3			2	1	0
 *
 * The waiters bit (0) indicates whether any threads are blocked waiting
 * for the lock.  The write-wanted bit (1) indicates whether any threads
 * are blocked waiting for write access.  The write-locked bit (2) indicates
 * whether the lock is held by a writer, which determines whether the upper
 * bits (3..31 in ILP32, 3..63 in LP64) should be interpreted as the owner
 * (thread pointer) or the hold count (number of readers).
 *
 * In the absence of any contention, a writer gets the lock by setting
 * this word to (curthread | RW_WRITE_LOCKED); a reader gets the lock
 * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK).
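 *
 * For example, three uncontended readers leave the word at
 * 3 * RW_READ_LOCK = 0x18.  Because the owner field occupies bits 3 and
 * up, thread pointers must be at least 8-byte aligned; a writer whose
 * (hypothetical) curthread value is 0xffffff0012345680 would set the
 * word to 0xffffff0012345684, i.e. the owner pointer with only the
 * write-locked bit set.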
 *
 * A writer will fail to acquire the lock if any other thread owns it.
 * A reader will fail if the lock is either owned or wanted by a writer.
 * rw_tryenter() returns 0 in these cases; rw_enter() blocks until the
 * lock becomes available.
 *
 * When a thread blocks it acquires the rwlock's hashed turnstile lock and
 * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case)
 * atomically *only if the lock still appears busy*.  A thread must never
 * accidentally block for an available lock since there would be no owner
 * to awaken it.  casip() provides the required atomicity.  Once casip()
 * succeeds, the decision to block becomes final and irreversible.  The
 * thread will not become runnable again until it has been granted ownership
 * of the lock via direct handoff from a former owner as described below.
 *
 * In the absence of any waiters, rw_exit() just clears the lock (if it
 * is write-locked) or decrements the hold count (if it is read-locked).
 * Note that even if waiters are present, decrementing the hold count
 * to a non-zero value requires no special action since the lock is still
 * held by at least one other thread.
 *
 * On the "final exit" (transition to unheld state) of a lock with waiters,
 * rw_exit_wakeup() grabs the turnstile lock and transfers ownership directly
 * to the next writer or set of readers.  There are several advantages to this
 * approach: (1) it closes all windows for priority inversion (when a new
 * writer has grabbed the lock but has not yet inherited from blocked readers);
 * (2) it prevents starvation of equal-priority threads by granting the lock
 * in FIFO order; (3) it eliminates the need for a write-wanted count -- a
 * single bit suffices because the lock remains held until all waiting
 * writers are gone; (4) when we awaken N readers we can perform a single
 * "atomic_add(&x, N)" to set the total hold count rather than having all N
 * threads fight for the cache to perform an "atomic_add(&x, 1)" upon wakeup.
 *
 * The most interesting policy decision in rw_exit_wakeup() is which thread
 * to wake.  Starvation is always possible with priority-based scheduling,
 * but any sane wakeup policy should at least satisfy these requirements:
 *
 * (1) The highest-priority thread in the system should not starve.
 * (2) The highest-priority writer should not starve.
 * (3) No writer should starve due to lower-priority threads.
 * (4) No reader should starve due to lower-priority writers.
 * (5) If all threads have equal priority, none of them should starve.
 *
 * We used to employ a writers-always-win policy, which doesn't even
 * satisfy (1): a steady stream of low-priority writers can starve out
 * a real-time reader!  This is clearly a broken policy -- it violates
 * (1), (4), and (5) -- but it's how rwlocks always used to behave.
 *
 * A round-robin policy (exiting readers grant the lock to blocked writers
 * and vice versa) satisfies all but (3): a single high-priority writer
 * and many low-priority readers can starve out medium-priority writers.
 *
 * A strict priority policy (grant the lock to the highest priority blocked
 * thread) satisfies everything but (2): a steady stream of high-priority
 * readers can permanently starve the highest-priority writer.
 *
 * The reason we care about (2) is that it's important to process writers
 * reasonably quickly -- even if they're low priority -- because their very
 * presence causes all readers to take the slow (blocking) path through this
 * code.  There is also a general sense that writers deserve some degree of
 * deference because they're updating the data upon which all readers act.
 * Presumably this data should not be allowed to become arbitrarily stale
 * due to writer starvation.  Finally, it seems reasonable to level the
 * playing field a bit to compensate for the fact that it's so much harder
 * for a writer to get in when there are already many readers present.
 *
 * A hybrid of round-robin and strict priority can be made to satisfy
 * all five criteria.  In this "writer priority policy" exiting readers
 * always grant the lock to waiting writers, but exiting writers only
 * grant the lock to readers of the same or higher priority than the
 * highest-priority blocked writer.  Thus requirement (2) is satisfied,
 * necessarily, by a willful act of priority inversion: an exiting reader
 * will grant the lock to a blocked writer even if there are blocked
 * readers of higher priority.  The situation is mitigated by the fact
 * that writers always inherit priority from blocked readers, and the
 * writer will awaken those readers as soon as it exits the lock.
 *
 * rw_downgrade() follows the same wakeup policy as an exiting writer.
 *
 * rw_tryupgrade() has the same failure mode as rw_tryenter() for a
 * write lock.  Both honor the WRITE_WANTED bit by specification.
 *
 * The following rules apply to manipulation of rwlock internal state:
 *
 * (1) The rwlock is only modified via the atomic primitives casip()
 *     and atomic_add_ip().
 *
 * (2) The waiters bit and write-wanted bit are only modified under
 *     turnstile_lookup().  This ensures that the turnstile is consistent
 *     with the rwlock.
 *
 * (3) Waiters receive the lock by direct handoff from the previous
 *     owner.  Therefore, waiters *always* wake up holding the lock.
 */
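
/*
 * Illustrative usage sketch (not part of this implementation): the
 * consumer-side programming model described in rwlock(9F).  The foo_t
 * structure and its fields below are hypothetical.
 *
 *	typedef struct foo {
 *		krwlock_t	foo_rwlock;
 *		int		foo_count;
 *	} foo_t;
 *
 *	rw_init(&foop->foo_rwlock, NULL, RW_DEFAULT, NULL);
 *
 *	rw_enter(&foop->foo_rwlock, RW_READER);		(shared access)
 *	count = foop->foo_count;
 *	rw_exit(&foop->foo_rwlock);
 *
 *	rw_enter(&foop->foo_rwlock, RW_WRITER);		(exclusive access)
 *	foop->foo_count++;
 *	rw_exit(&foop->foo_rwlock);
 *
 *	rw_destroy(&foop->foo_rwlock);
 */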

/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of a given type.
 */
static sobj_ops_t rw_sobj_ops = {
	SOBJ_RWLOCK, rw_owner, turnstile_stay_asleep, turnstile_change_pri
};

/*
 * If the system panics on an rwlock, save the address of the offending
 * rwlock in panic_rwlock_addr, and save the contents in panic_rwlock.
 */
static rwlock_impl_t panic_rwlock;
static rwlock_impl_t *panic_rwlock_addr;

static void
rw_panic(char *msg, rwlock_impl_t *lp)
{
	if (panicstr)
		return;

	if (casptr(&panic_rwlock_addr, NULL, lp) == NULL)
		panic_rwlock = *lp;

	panic("%s, lp=%p wwwh=%lx thread=%p",
	    msg, lp, panic_rwlock.rw_wwwh, curthread);
}

/* ARGSUSED */
void
rw_init(krwlock_t *rwlp, char *name, krw_type_t type, void *arg)
{
	((rwlock_impl_t *)rwlp)->rw_wwwh = 0;
}

void
rw_destroy(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	if (lp->rw_wwwh != 0) {
		if ((lp->rw_wwwh & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK)
			rw_panic("rw_destroy: lock already destroyed", lp);
		else
			rw_panic("rw_destroy: lock still active", lp);
	}

	lp->rw_wwwh = RW_DOUBLE_LOCK;
}

/*
 * Verify that an rwlock is held correctly.
 */
static int
rw_locked(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old = lp->rw_wwwh;

	if (rw == RW_READER)
		return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));

	if (rw == RW_WRITER)
		return ((old & RW_OWNER) == (uintptr_t)curthread);

	return (0);
}

/*
 * Full-service implementation of rw_enter() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * The only semantic difference between calling rw_enter() and calling
 * rw_enter_sleep() directly is that we assume the caller has already done
 * a THREAD_KPRI_REQUEST() in the RW_READER case.
 */
void
rw_enter_sleep(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old, new, lock_value, lock_busy, lock_wait;
	hrtime_t sleep_time;
	turnstile_t *ts;

	if (rw == RW_READER) {
		lock_value = RW_READ_LOCK;
		lock_busy = RW_WRITE_CLAIMED;
		lock_wait = RW_HAS_WAITERS;
	} else {
		lock_value = RW_WRITE_LOCK(curthread);
		lock_busy = (uintptr_t)RW_LOCKED;
		lock_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
	}

	for (;;) {
		if (((old = lp->rw_wwwh) & lock_busy) == 0) {
			if (casip(&lp->rw_wwwh, old, old + lock_value) != old)
				continue;
			break;
		}

		if (panicstr)
			return;

		if ((old & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) {
			rw_panic("rw_enter: bad rwlock", lp);
			return;
		}

		if ((old & RW_OWNER) == (uintptr_t)curthread) {
			rw_panic("recursive rw_enter", lp);
			return;
		}

		ts = turnstile_lookup(lp);

		do {
			if (((old = lp->rw_wwwh) & lock_busy) == 0)
				break;
			new = old | lock_wait;
		} while (old != new && casip(&lp->rw_wwwh, old, new) != old);

		if ((old & lock_busy) == 0) {
			/*
			 * The lock appears free now; try the dance again
			 */
			turnstile_exit(lp);
			continue;
		}

		/*
		 * We really are going to block.  Bump the stats, and drop
		 * kpri if we're a reader.
		 */
		ASSERT(lp->rw_wwwh & lock_wait);
		ASSERT(lp->rw_wwwh & RW_LOCKED);

		sleep_time = -gethrtime();
		if (rw == RW_READER) {
			THREAD_KPRI_RELEASE();
			CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1);
			(void) turnstile_block(ts, TS_READER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		} else {
			CPU_STATS_ADDQ(CPU, sys, rw_wrfails, 1);
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		}
		sleep_time += gethrtime();

		LOCKSTAT_RECORD4(LS_RW_ENTER_BLOCK, lp, sleep_time, rw,
		    (old & RW_WRITE_LOCKED) ? 1 : 0,
		    old >> RW_HOLD_COUNT_SHIFT);

		/*
		 * We wake up holding the lock (and having kpri if we're
		 * a reader) via direct handoff from the previous owner.
		 */
		break;
	}

	ASSERT(rw_locked(lp, rw));

	membar_enter();

	LOCKSTAT_RECORD(LS_RW_ENTER_ACQUIRE, lp, rw);
}

/*
 * Return the number of readers to wake, or zero if we should wake a writer.
 * Called only by exiting/downgrading writers (readers don't wake readers).
 */
static int
rw_readers_to_wake(turnstile_t *ts)
{
	kthread_t *next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;
	kthread_t *next_reader = ts->ts_sleepq[TS_READER_Q].sq_first;
	pri_t wpri = (next_writer != NULL) ? DISP_PRIO(next_writer) : -1;
	int count = 0;

	while (next_reader != NULL) {
		if (DISP_PRIO(next_reader) < wpri)
			break;
		next_reader->t_kpri_req++;
		next_reader = next_reader->t_link;
		count++;
	}
	return (count);
}

/*
 * Full-service implementation of rw_exit() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * There is no semantic difference between calling rw_exit() and calling
 * rw_exit_wakeup() directly.
 */
void
rw_exit_wakeup(rwlock_impl_t *lp)
{
	turnstile_t *ts;
	uintptr_t old, new, lock_value;
	kthread_t *next_writer;
	int nreaders;

	membar_exit();

	old = lp->rw_wwwh;
	if (old & RW_WRITE_LOCKED) {
		if ((old & RW_OWNER) != (uintptr_t)curthread) {
			rw_panic("rw_exit: not owner", lp);
			lp->rw_wwwh = 0;
			return;
		}
		lock_value = RW_WRITE_LOCK(curthread);
	} else {
		if ((old & RW_LOCKED) == 0) {
			rw_panic("rw_exit: lock not held", lp);
			return;
		}
		lock_value = RW_READ_LOCK;
	}

	for (;;) {
		/*
		 * If this is *not* the final exit of a lock with waiters,
		 * just drop the lock -- there's nothing tricky going on.
		 */
		old = lp->rw_wwwh;
		new = old - lock_value;
		if ((new & (RW_LOCKED | RW_HAS_WAITERS)) != RW_HAS_WAITERS) {
			if (casip(&lp->rw_wwwh, old, new) != old)
				continue;
			break;
		}

		/*
		 * Perform the final exit of a lock that has waiters.
		 */
		ts = turnstile_lookup(lp);

		next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;

		if ((old & RW_WRITE_LOCKED) &&
		    (nreaders = rw_readers_to_wake(ts)) > 0) {
			/*
			 * Don't drop the lock -- just set the hold count
			 * such that we grant the lock to all readers at once.
			 */
			new = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters > nreaders)
				new |= RW_HAS_WAITERS;
			if (next_writer)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
		} else {
			/*
			 * Don't drop the lock -- just transfer ownership
			 * directly to next_writer.  Note that there must
			 * be at least one waiting writer, because we get
			 * here only if (A) the lock is read-locked or
			 * (B) there are no waiting readers.  In case (A),
			 * since the lock is read-locked there would be no
			 * reason for other readers to have blocked unless
			 * the RW_WRITE_WANTED bit was set.  In case (B),
			 * since there are waiters but no waiting readers,
			 * they must all be waiting writers.
			 */
			ASSERT(lp->rw_wwwh & RW_WRITE_WANTED);
			new = RW_WRITE_LOCK(next_writer);
			if (ts->ts_waiters > 1)
				new |= RW_HAS_WAITERS;
			if (next_writer->t_link)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_WRITER_Q, 1, next_writer);
		}
		break;
	}

	if (lock_value == RW_READ_LOCK) {
		THREAD_KPRI_RELEASE();
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_READER);
	} else {
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_WRITER);
	}
}

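/*
 * Attempt to acquire the lock without blocking: returns non-zero on
 * success and zero if the lock is not immediately available.
 */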
int
rw_tryenter(krwlock_t *rwlp, krw_t rw)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old;

	if (rw == RW_READER) {
		THREAD_KPRI_REQUEST();
		do {
			if ((old = lp->rw_wwwh) & RW_WRITE_CLAIMED) {
				THREAD_KPRI_RELEASE();
				return (0);
			}
		} while (casip(&lp->rw_wwwh, old, old + RW_READ_LOCK) != old);
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	} else {
		if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) != 0)
			return (0);
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	}
	ASSERT(rw_locked(lp, rw));
	membar_enter();
	return (1);
}

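/*
 * Downgrade a write lock to a read lock without releasing it.  Waiting
 * readers are awakened under the same policy as an exiting writer.
 */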
void
rw_downgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	THREAD_KPRI_REQUEST();
	membar_exit();

	if ((lp->rw_wwwh & RW_OWNER) != (uintptr_t)curthread) {
		rw_panic("rw_downgrade: not owner", lp);
		return;
	}

	if (atomic_add_ip_nv(&lp->rw_wwwh,
	    RW_READ_LOCK - RW_WRITE_LOCK(curthread)) & RW_HAS_WAITERS) {
		turnstile_t *ts = turnstile_lookup(lp);
		int nreaders = rw_readers_to_wake(ts);
		if (nreaders > 0) {
			uintptr_t delta = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters == nreaders)
				delta -= RW_HAS_WAITERS;
			atomic_add_ip(&lp->rw_wwwh, delta);
		}
		turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
	}
	ASSERT(rw_locked(lp, RW_READER));
	LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, lp);
}

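/*
 * Attempt to upgrade a read lock to a write lock; returns non-zero on
 * success.  The upgrade fails if any other reader holds the lock or if
 * RW_WRITE_WANTED is set.  A typical caller pattern (illustrative only,
 * using the hypothetical foo_t from the sketch above) is to fall back
 * to dropping and reacquiring the lock, then revalidating any state
 * derived under the read lock:
 *
 *	if (!rw_tryupgrade(&foop->foo_rwlock)) {
 *		rw_exit(&foop->foo_rwlock);
 *		rw_enter(&foop->foo_rwlock, RW_WRITER);
 *		(revalidate state here)
 *	}
 */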
int
rw_tryupgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old, new;

	ASSERT(rw_locked(lp, RW_READER));

	do {
		if (((old = lp->rw_wwwh) & ~RW_HAS_WAITERS) != RW_READ_LOCK)
			return (0);
		new = old + RW_WRITE_LOCK(curthread) - RW_READ_LOCK;
	} while (casip(&lp->rw_wwwh, old, new) != old);

	membar_enter();
	THREAD_KPRI_RELEASE();
	LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, lp);
	ASSERT(rw_locked(lp, RW_WRITER));
	return (1);
}

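/*
 * Return non-zero if the lock is held for reading.
 */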
int
rw_read_held(krwlock_t *rwlp)
{
	uintptr_t tmp;

	return (_RW_READ_HELD(rwlp, tmp));
}

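/*
 * Return non-zero if the lock is write-locked.
 */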
int
rw_write_held(krwlock_t *rwlp)
{
	return (_RW_WRITE_HELD(rwlp));
}

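/*
 * Return non-zero if the lock is held, either for reading or for writing.
 */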
int
rw_lock_held(krwlock_t *rwlp)
{
	return (_RW_LOCK_HELD(rwlp));
}

/*
 * Like rw_read_held(), but ASSERTs that the lock is currently held
 */
int
rw_read_locked(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	ASSERT(old & RW_LOCKED);
	return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));
}

/*
 * Returns non-zero if the lock is either held or desired by a writer
 */
int
rw_iswriter(krwlock_t *rwlp)
{
	return (_RW_ISWRITER(rwlp));
}

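/*
 * Return the thread that owns the lock for writing, or NULL if the lock
 * is not write-locked.
 */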
kthread_t *
rw_owner(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	return ((old & RW_WRITE_LOCKED) ? (kthread_t *)(old & RW_OWNER) : NULL);
}