1*7c478bd9Sstevel@tonic-gate /* 2*7c478bd9Sstevel@tonic-gate * CDDL HEADER START 3*7c478bd9Sstevel@tonic-gate * 4*7c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*7c478bd9Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 6*7c478bd9Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 7*7c478bd9Sstevel@tonic-gate * with the License. 8*7c478bd9Sstevel@tonic-gate * 9*7c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*7c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 11*7c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 12*7c478bd9Sstevel@tonic-gate * and limitations under the License. 13*7c478bd9Sstevel@tonic-gate * 14*7c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 15*7c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*7c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 17*7c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 18*7c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 19*7c478bd9Sstevel@tonic-gate * 20*7c478bd9Sstevel@tonic-gate * CDDL HEADER END 21*7c478bd9Sstevel@tonic-gate */ 22*7c478bd9Sstevel@tonic-gate /* 23*7c478bd9Sstevel@tonic-gate * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24*7c478bd9Sstevel@tonic-gate * Use is subject to license terms. 
25*7c478bd9Sstevel@tonic-gate */ 26*7c478bd9Sstevel@tonic-gate 27*7c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 28*7c478bd9Sstevel@tonic-gate 29*7c478bd9Sstevel@tonic-gate #include <sys/param.h> 30*7c478bd9Sstevel@tonic-gate #include <sys/thread.h> 31*7c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 32*7c478bd9Sstevel@tonic-gate #include <sys/debug.h> 33*7c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 34*7c478bd9Sstevel@tonic-gate #include <sys/sobject.h> 35*7c478bd9Sstevel@tonic-gate #include <sys/turnstile.h> 36*7c478bd9Sstevel@tonic-gate #include <sys/rwlock.h> 37*7c478bd9Sstevel@tonic-gate #include <sys/rwlock_impl.h> 38*7c478bd9Sstevel@tonic-gate #include <sys/atomic.h> 39*7c478bd9Sstevel@tonic-gate #include <sys/lockstat.h> 40*7c478bd9Sstevel@tonic-gate 41*7c478bd9Sstevel@tonic-gate /* 42*7c478bd9Sstevel@tonic-gate * Big Theory Statement for readers/writer locking primitives. 43*7c478bd9Sstevel@tonic-gate * 44*7c478bd9Sstevel@tonic-gate * An rwlock provides exclusive access to a single thread ("writer") or 45*7c478bd9Sstevel@tonic-gate * concurrent access to multiple threads ("readers"). See rwlock(9F) 46*7c478bd9Sstevel@tonic-gate * for a full description of the interfaces and programming model. 47*7c478bd9Sstevel@tonic-gate * The rest of this comment describes the implementation. 48*7c478bd9Sstevel@tonic-gate * 49*7c478bd9Sstevel@tonic-gate * An rwlock is a single word with the following structure: 50*7c478bd9Sstevel@tonic-gate * 51*7c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------- 52*7c478bd9Sstevel@tonic-gate * | OWNER (writer) or HOLD COUNT (readers) | WRLOCK | WRWANT | WAIT | 53*7c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------- 54*7c478bd9Sstevel@tonic-gate * 63 / 31 .. 
3 2 1 0 55*7c478bd9Sstevel@tonic-gate * 56*7c478bd9Sstevel@tonic-gate * The waiters bit (0) indicates whether any threads are blocked waiting 57*7c478bd9Sstevel@tonic-gate * for the lock. The write-wanted bit (1) indicates whether any threads 58*7c478bd9Sstevel@tonic-gate * are blocked waiting for write access. The write-locked bit (2) indicates 59*7c478bd9Sstevel@tonic-gate * whether the lock is held by a writer, which determines whether the upper 60*7c478bd9Sstevel@tonic-gate * bits (3..31 in ILP32, 3..63 in LP64) should be interpreted as the owner 61*7c478bd9Sstevel@tonic-gate * (thread pointer) or the hold count (number of readers). 62*7c478bd9Sstevel@tonic-gate * 63*7c478bd9Sstevel@tonic-gate * In the absence of any contention, a writer gets the lock by setting 64*7c478bd9Sstevel@tonic-gate * this word to (curthread | RW_WRITE_LOCKED); a reader gets the lock 65*7c478bd9Sstevel@tonic-gate * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK). 66*7c478bd9Sstevel@tonic-gate * 67*7c478bd9Sstevel@tonic-gate * A writer will fail to acquire the lock if any other thread owns it. 68*7c478bd9Sstevel@tonic-gate * A reader will fail if the lock is either owned or wanted by a writer. 69*7c478bd9Sstevel@tonic-gate * rw_tryenter() returns 0 in these cases; rw_enter() blocks until the 70*7c478bd9Sstevel@tonic-gate * lock becomes available. 71*7c478bd9Sstevel@tonic-gate * 72*7c478bd9Sstevel@tonic-gate * When a thread blocks it acquires the rwlock's hashed turnstile lock and 73*7c478bd9Sstevel@tonic-gate * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case) 74*7c478bd9Sstevel@tonic-gate * atomically *only if the lock still appears busy*. A thread must never 75*7c478bd9Sstevel@tonic-gate * accidentally block for an available lock since there would be no owner 76*7c478bd9Sstevel@tonic-gate * to awaken it. casip() provides the required atomicity. 
Once casip() 77*7c478bd9Sstevel@tonic-gate * succeeds, the decision to block becomes final and irreversible. The 78*7c478bd9Sstevel@tonic-gate * thread will not become runnable again until it has been granted ownership 79*7c478bd9Sstevel@tonic-gate * of the lock via direct handoff from a former owner as described below. 80*7c478bd9Sstevel@tonic-gate * 81*7c478bd9Sstevel@tonic-gate * In the absence of any waiters, rw_exit() just clears the lock (if it 82*7c478bd9Sstevel@tonic-gate * is write-locked) or decrements the hold count (if it is read-locked). 83*7c478bd9Sstevel@tonic-gate * Note that even if waiters are present, decrementing the hold count 84*7c478bd9Sstevel@tonic-gate * to a non-zero value requires no special action since the lock is still 85*7c478bd9Sstevel@tonic-gate * held by at least one other thread. 86*7c478bd9Sstevel@tonic-gate * 87*7c478bd9Sstevel@tonic-gate * On the "final exit" (transition to unheld state) of a lock with waiters, 88*7c478bd9Sstevel@tonic-gate * rw_exit_wakeup() grabs the turnstile lock and transfers ownership directly 89*7c478bd9Sstevel@tonic-gate * to the next writer or set of readers. 
There are several advantages to this 90*7c478bd9Sstevel@tonic-gate * approach: (1) it closes all windows for priority inversion (when a new 91*7c478bd9Sstevel@tonic-gate * writer has grabbed the lock but has not yet inherited from blocked readers); 92*7c478bd9Sstevel@tonic-gate * (2) it prevents starvation of equal-priority threads by granting the lock 93*7c478bd9Sstevel@tonic-gate * in FIFO order; (3) it eliminates the need for a write-wanted count -- a 94*7c478bd9Sstevel@tonic-gate * single bit suffices because the lock remains held until all waiting 95*7c478bd9Sstevel@tonic-gate * writers are gone; (4) when we awaken N readers we can perform a single 96*7c478bd9Sstevel@tonic-gate * "atomic_add(&x, N)" to set the total hold count rather than having all N 97*7c478bd9Sstevel@tonic-gate * threads fight for the cache to perform an "atomic_add(&x, 1)" upon wakeup. 98*7c478bd9Sstevel@tonic-gate * 99*7c478bd9Sstevel@tonic-gate * The most interesting policy decision in rw_exit_wakeup() is which thread 100*7c478bd9Sstevel@tonic-gate * to wake. Starvation is always possible with priority-based scheduling, 101*7c478bd9Sstevel@tonic-gate * but any sane wakeup policy should at least satisfy these requirements: 102*7c478bd9Sstevel@tonic-gate * 103*7c478bd9Sstevel@tonic-gate * (1) The highest-priority thread in the system should not starve. 104*7c478bd9Sstevel@tonic-gate * (2) The highest-priority writer should not starve. 105*7c478bd9Sstevel@tonic-gate * (3) No writer should starve due to lower-priority threads. 106*7c478bd9Sstevel@tonic-gate * (4) No reader should starve due to lower-priority writers. 107*7c478bd9Sstevel@tonic-gate * (5) If all threads have equal priority, none of them should starve. 
108*7c478bd9Sstevel@tonic-gate * 109*7c478bd9Sstevel@tonic-gate * We used to employ a writers-always-win policy, which doesn't even 110*7c478bd9Sstevel@tonic-gate * satisfy (1): a steady stream of low-priority writers can starve out 111*7c478bd9Sstevel@tonic-gate * a real-time reader! This is clearly a broken policy -- it violates 112*7c478bd9Sstevel@tonic-gate * (1), (4), and (5) -- but it's how rwlocks always used to behave. 113*7c478bd9Sstevel@tonic-gate * 114*7c478bd9Sstevel@tonic-gate * A round-robin policy (exiting readers grant the lock to blocked writers 115*7c478bd9Sstevel@tonic-gate * and vice versa) satisfies all but (3): a single high-priority writer 116*7c478bd9Sstevel@tonic-gate * and many low-priority readers can starve out medium-priority writers. 117*7c478bd9Sstevel@tonic-gate * 118*7c478bd9Sstevel@tonic-gate * A strict priority policy (grant the lock to the highest priority blocked 119*7c478bd9Sstevel@tonic-gate * thread) satisfies everything but (2): a steady stream of high-priority 120*7c478bd9Sstevel@tonic-gate * readers can permanently starve the highest-priority writer. 121*7c478bd9Sstevel@tonic-gate * 122*7c478bd9Sstevel@tonic-gate * The reason we care about (2) is that it's important to process writers 123*7c478bd9Sstevel@tonic-gate * reasonably quickly -- even if they're low priority -- because their very 124*7c478bd9Sstevel@tonic-gate * presence causes all readers to take the slow (blocking) path through this 125*7c478bd9Sstevel@tonic-gate * code. There is also a general sense that writers deserve some degree of 126*7c478bd9Sstevel@tonic-gate * deference because they're updating the data upon which all readers act. 127*7c478bd9Sstevel@tonic-gate * Presumably this data should not be allowed to become arbitrarily stale 128*7c478bd9Sstevel@tonic-gate * due to writer starvation. 
Finally, it seems reasonable to level the 129*7c478bd9Sstevel@tonic-gate * playing field a bit to compensate for the fact that it's so much harder 130*7c478bd9Sstevel@tonic-gate * for a writer to get in when there are already many readers present. 131*7c478bd9Sstevel@tonic-gate * 132*7c478bd9Sstevel@tonic-gate * A hybrid of round-robin and strict priority can be made to satisfy 133*7c478bd9Sstevel@tonic-gate * all five criteria. In this "writer priority policy" exiting readers 134*7c478bd9Sstevel@tonic-gate * always grant the lock to waiting writers, but exiting writers only 135*7c478bd9Sstevel@tonic-gate * grant the lock to readers of the same or higher priority than the 136*7c478bd9Sstevel@tonic-gate * highest-priority blocked writer. Thus requirement (2) is satisfied, 137*7c478bd9Sstevel@tonic-gate * necessarily, by a willful act of priority inversion: an exiting reader 138*7c478bd9Sstevel@tonic-gate * will grant the lock to a blocked writer even if there are blocked 139*7c478bd9Sstevel@tonic-gate * readers of higher priority. The situation is mitigated by the fact 140*7c478bd9Sstevel@tonic-gate * that writers always inherit priority from blocked readers, and the 141*7c478bd9Sstevel@tonic-gate * writer will awaken those readers as soon as it exits the lock. 142*7c478bd9Sstevel@tonic-gate * 143*7c478bd9Sstevel@tonic-gate * rw_downgrade() follows the same wakeup policy as an exiting writer. 144*7c478bd9Sstevel@tonic-gate * 145*7c478bd9Sstevel@tonic-gate * rw_tryupgrade() has the same failure mode as rw_tryenter() for a 146*7c478bd9Sstevel@tonic-gate * write lock. Both honor the WRITE_WANTED bit by specification. 147*7c478bd9Sstevel@tonic-gate * 148*7c478bd9Sstevel@tonic-gate * The following rules apply to manipulation of rwlock internal state: 149*7c478bd9Sstevel@tonic-gate * 150*7c478bd9Sstevel@tonic-gate * (1) The rwlock is only modified via the atomic primitives casip() 151*7c478bd9Sstevel@tonic-gate * and atomic_add_ip(). 
 *
 * (2)	The waiters bit and write-wanted bit are only modified under
 *	turnstile_lookup().  This ensures that the turnstile is consistent
 *	with the rwlock.
 *
 * (3)	Waiters receive the lock by direct handoff from the previous
 *	owner.  Therefore, waiters *always* wake up holding the lock.
 */

/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of a given type.
 */
static sobj_ops_t rw_sobj_ops = {
	SOBJ_RWLOCK, rw_owner, turnstile_stay_asleep, turnstile_change_pri
};

/*
 * If the system panics on an rwlock, save the address of the offending
 * rwlock in panic_rwlock_addr, and save the contents in panic_rwlock.
 */
static rwlock_impl_t panic_rwlock;
static rwlock_impl_t *panic_rwlock_addr;

/*
 * Panic the system because of a misused rwlock.  The first thread to get
 * here (i.e. the one whose casptr() installs its lock into
 * panic_rwlock_addr) snapshots the lock contents into panic_rwlock, so the
 * panic message reflects the lock state at the time of the first failure
 * even if multiple CPUs trip over rwlocks concurrently.  If a panic is
 * already in progress, do nothing and let it proceed.
 */
static void
rw_panic(char *msg, rwlock_impl_t *lp)
{
	if (panicstr)
		return;

	if (casptr(&panic_rwlock_addr, NULL, lp) == NULL)
		panic_rwlock = *lp;

	panic("%s, lp=%p wwwh=%lx thread=%p",
	    msg, lp, panic_rwlock.rw_wwwh, curthread);
}

/*
 * Initialize an rwlock: the unheld state is simply a zeroed lock word.
 * The name, type, and arg parameters are unused (accepted for interface
 * compatibility with other lock initializers).
 */
/* ARGSUSED */
void
rw_init(krwlock_t *rwlp, char *name, krw_type_t type, void *arg)
{
	((rwlock_impl_t *)rwlp)->rw_wwwh = 0;
}

/*
 * Destroy an rwlock.  Panics if the lock is still held or was already
 * destroyed; otherwise poisons the lock word with RW_DOUBLE_LOCK so any
 * subsequent use is caught by rw_enter()/rw_destroy().
 */
void
rw_destroy(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	if (lp->rw_wwwh != 0) {
		/* Distinguish double-destroy from destroy-while-held. */
		if ((lp->rw_wwwh & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK)
			rw_panic("rw_destroy: lock already destroyed", lp);
		else
			rw_panic("rw_destroy: lock still active", lp);
	}

	lp->rw_wwwh = RW_DOUBLE_LOCK;
}

/*
 * Verify that an rwlock is held correctly.
 */
static int
rw_locked(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old = lp->rw_wwwh;

	/*
	 * Read-held: some hold count is present (RW_LOCKED) and the
	 * write-locked bit is clear.
	 */
	if (rw == RW_READER)
		return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));

	/*
	 * Write-held: the owner bits must name the current thread.
	 * (RW_OWNER includes RW_WRITE_LOCKED, so this also verifies
	 * the lock is write-locked.)
	 */
	if (rw == RW_WRITER)
		return ((old & RW_OWNER) == (uintptr_t)curthread);

	return (0);
}

/*
 * Full-service implementation of rw_enter() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * The only semantic difference between calling rw_enter() and calling
 * rw_enter_sleep() directly is that we assume the caller has already done
 * a THREAD_KPRI_REQUEST() in the RW_READER case.
 */
void
rw_enter_sleep(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old, new, lock_value, lock_busy, lock_wait;
	hrtime_t sleep_time;
	turnstile_t *ts;

	/*
	 * Precompute the mode-specific values so the acquisition loop
	 * below is shared by readers and writers:
	 *
	 * lock_value:	what to add to the lock word on acquisition
	 *		(hold-count increment, or owner | RW_WRITE_LOCKED)
	 * lock_busy:	the bits that mean "busy for our mode" -- a reader
	 *		is blocked by RW_WRITE_CLAIMED (held *or* wanted by
	 *		a writer); a writer is blocked by any holder
	 * lock_wait:	the waiter bits we must set before blocking
	 */
	if (rw == RW_READER) {
		lock_value = RW_READ_LOCK;
		lock_busy = RW_WRITE_CLAIMED;
		lock_wait = RW_HAS_WAITERS;
	} else {
		lock_value = RW_WRITE_LOCK(curthread);
		lock_busy = (uintptr_t)RW_LOCKED;
		lock_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
	}

	for (;;) {
		/*
		 * Fast path: lock doesn't look busy for our mode, so try
		 * to grab it atomically; retry on CAS failure (someone
		 * else changed the lock word underneath us).
		 */
		if (((old = lp->rw_wwwh) & lock_busy) == 0) {
			if (casip(&lp->rw_wwwh, old, old + lock_value) != old)
				continue;
			break;
		}

		/* Never block during panic; just let the caller proceed. */
		if (panicstr)
			return;

		if ((old & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) {
			rw_panic("rw_enter: bad rwlock", lp);
			return;
		}

		if ((old & RW_OWNER) == (uintptr_t)curthread) {
			rw_panic("recursive rw_enter", lp);
			return;
		}

		ts = turnstile_lookup(lp);

		/*
		 * With the turnstile lock held, set the waiter bits --
		 * but only while the lock still appears busy, since we
		 * must never block on an available lock (there would be
		 * no owner to wake us; see the Big Theory Statement).
		 */
		do {
			if (((old = lp->rw_wwwh) & lock_busy) == 0)
				break;
			new = old | lock_wait;
		} while (old != new && casip(&lp->rw_wwwh, old, new) != old);

		if ((old & lock_busy) == 0) {
			/*
			 * The lock appears free now; try the dance again
			 */
			turnstile_exit(lp);
			continue;
		}

		/*
		 * We really are going to block.  Bump the stats, and drop
		 * kpri if we're a reader.
		 */
		ASSERT(lp->rw_wwwh & lock_wait);
		ASSERT(lp->rw_wwwh & RW_LOCKED);

		/* Measure the blocked interval for lockstat. */
		sleep_time = -gethrtime();
		if (rw == RW_READER) {
			THREAD_KPRI_RELEASE();
			CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1);
			(void) turnstile_block(ts, TS_READER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		} else {
			CPU_STATS_ADDQ(CPU, sys, rw_wrfails, 1);
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		}
		sleep_time += gethrtime();

		LOCKSTAT_RECORD4(LS_RW_ENTER_BLOCK, lp, sleep_time, rw,
		    (old & RW_WRITE_LOCKED) ? 1 : 0,
		    old >> RW_HOLD_COUNT_SHIFT);

		/*
		 * We wake up holding the lock (and having kpri if we're
		 * a reader) via direct handoff from the previous owner.
		 */
		break;
	}

	ASSERT(rw_locked(lp, rw));

	/* Acquire barrier: order the lock grab before the critical section. */
	membar_enter();

	LOCKSTAT_RECORD(LS_RW_ENTER_ACQUIRE, lp, rw);
}

/*
 * Return the number of readers to wake, or zero if we should wake a writer.
 * Called only by exiting/downgrading writers (readers don't wake readers).
 */
static int
rw_readers_to_wake(turnstile_t *ts)
{
	kthread_t *next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;
	kthread_t *next_reader = ts->ts_sleepq[TS_READER_Q].sq_first;
	pri_t wpri = (next_writer != NULL) ?
	    DISP_PRIO(next_writer) : -1;
	int count = 0;

	/*
	 * Walk the reader sleep queue (kept in priority order by the
	 * turnstile code -- TODO confirm ordering invariant against
	 * turnstile implementation) and count readers whose priority is
	 * at least that of the best blocked writer; stop at the first
	 * lower-priority reader.  Each reader we intend to wake gets a
	 * kernel-priority request so it runs promptly once granted the
	 * lock.  wpri == -1 (no blocked writer) wakes all readers.
	 */
	while (next_reader != NULL) {
		if (DISP_PRIO(next_reader) < wpri)
			break;
		next_reader->t_kpri_req++;
		next_reader = next_reader->t_link;
		count++;
	}
	return (count);
}

/*
 * Full-service implementation of rw_exit() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * There is no semantic difference between calling rw_exit() and calling
 * rw_exit_wakeup() directly.
 */
void
rw_exit_wakeup(rwlock_impl_t *lp)
{
	turnstile_t *ts;
	uintptr_t old, new, lock_value;
	kthread_t *next_writer;
	int nreaders;

	/* Release barrier: order the critical section before the release. */
	membar_exit();

	/*
	 * Sanity-check the release and determine what to subtract from
	 * the lock word: the full owner word for a writer, one hold-count
	 * increment for a reader.
	 */
	old = lp->rw_wwwh;
	if (old & RW_WRITE_LOCKED) {
		if ((old & RW_OWNER) != (uintptr_t)curthread) {
			rw_panic("rw_exit: not owner", lp);
			lp->rw_wwwh = 0;
			return;
		}
		lock_value = RW_WRITE_LOCK(curthread);
	} else {
		if ((old & RW_LOCKED) == 0) {
			rw_panic("rw_exit: lock not held", lp);
			return;
		}
		lock_value = RW_READ_LOCK;
	}

	for (;;) {
		/*
		 * If this is *not* the final exit of a lock with waiters,
		 * just drop the lock -- there's nothing tricky going on.
		 */
		old = lp->rw_wwwh;
		new = old - lock_value;
		if ((new & (RW_LOCKED | RW_HAS_WAITERS)) != RW_HAS_WAITERS) {
			if (casip(&lp->rw_wwwh, old, new) != old)
				continue;
			break;
		}

		/*
		 * Perform the final exit of a lock that has waiters.
		 */
		ts = turnstile_lookup(lp);

		next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;

		if ((old & RW_WRITE_LOCKED) &&
		    (nreaders = rw_readers_to_wake(ts)) > 0) {
			/*
			 * Don't drop the lock -- just set the hold count
			 * such that we grant the lock to all readers at once.
			 */
			new = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters > nreaders)
				new |= RW_HAS_WAITERS;
			if (next_writer)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
		} else {
			/*
			 * Don't drop the lock -- just transfer ownership
			 * directly to next_writer.  Note that there must
			 * be at least one waiting writer, because we get
			 * here only if (A) the lock is read-locked or
			 * (B) there are no waiting readers.  In case (A),
			 * since the lock is read-locked there would be no
			 * reason for other readers to have blocked unless
			 * the RW_WRITE_WANTED bit was set.  In case (B),
			 * since there are waiters but no waiting readers,
			 * they must all be waiting writers.
			 */
			ASSERT(lp->rw_wwwh & RW_WRITE_WANTED);
			new = RW_WRITE_LOCK(next_writer);
			if (ts->ts_waiters > 1)
				new |= RW_HAS_WAITERS;
			if (next_writer->t_link)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_WRITER_Q, 1, next_writer);
		}
		break;
	}

	/* Readers drop the kpri they acquired in rw_enter(). */
	if (lock_value == RW_READ_LOCK) {
		THREAD_KPRI_RELEASE();
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_READER);
	} else {
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_WRITER);
	}
}

/*
 * Try to acquire the lock without blocking; returns non-zero on success,
 * 0 if the lock was busy.  A reader fails if the lock is held or wanted
 * by a writer; a writer fails if the lock is held at all.
 */
int
rw_tryenter(krwlock_t *rwlp, krw_t rw)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old;

	if (rw == RW_READER) {
		/* Request kpri up front; undo it if we fail. */
		THREAD_KPRI_REQUEST();
		do {
			if ((old = lp->rw_wwwh) & RW_WRITE_CLAIMED) {
				THREAD_KPRI_RELEASE();
				return (0);
			}
		} while (casip(&lp->rw_wwwh, old, old + RW_READ_LOCK) != old);
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	} else {
		/*
		 * A writer can only succeed if the lock word is entirely
		 * clear -- no holders, no waiters, no write-wanted.
		 */
		if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) != 0)
			return (0);
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	}
	ASSERT(rw_locked(lp, rw));
	/* Acquire barrier: order the lock grab before the critical section. */
	membar_enter();
	return (1);
}

/*
 * Convert a write lock we own into a read hold without ever releasing
 * the lock.  Follows the same wakeup policy as an exiting writer: wake
 * all blocked readers of priority >= the best blocked writer (see
 * rw_readers_to_wake()); any remaining waiters stay blocked.
 */
void
rw_downgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	/* As a reader we need kpri, just as rw_enter(RW_READER) would take. */
	THREAD_KPRI_REQUEST();
	/* Release barrier: our writes must be visible before downgrading. */
	membar_exit();

	if ((lp->rw_wwwh & RW_OWNER) != (uintptr_t)curthread) {
		rw_panic("rw_downgrade: not owner", lp);
		return;
	}

	/*
	 * Atomically swap our owner word for a single read hold.  If the
	 * result still has RW_HAS_WAITERS set, hand additional read holds
	 * to every eligible blocked reader in one atomic add, clearing
	 * RW_HAS_WAITERS when that drains the turnstile entirely.
	 */
	if (atomic_add_ip_nv(&lp->rw_wwwh,
	    RW_READ_LOCK - RW_WRITE_LOCK(curthread)) & RW_HAS_WAITERS) {
		turnstile_t *ts = turnstile_lookup(lp);
		int nreaders = rw_readers_to_wake(ts);
		if (nreaders > 0) {
			uintptr_t delta = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters == nreaders)
				delta -= RW_HAS_WAITERS;
			atomic_add_ip(&lp->rw_wwwh, delta);
		}
		/* Also drops the turnstile lock taken by turnstile_lookup(). */
		turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
	}
	ASSERT(rw_locked(lp, RW_READER));
	LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, lp);
}

/*
 * Try to convert our read hold into a write lock; returns non-zero on
 * success, 0 on failure.  Succeeds only if we are the *sole* reader and
 * no writer wants the lock (RW_WRITE_WANTED honored by specification);
 * pre-existing waiters (RW_HAS_WAITERS) are tolerated and preserved.
 */
int
rw_tryupgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old, new;

	ASSERT(rw_locked(lp, RW_READER));

	do {
		/*
		 * Ignoring the waiters bit, the lock word must be exactly
		 * one read hold -- ours.
		 */
		if (((old = lp->rw_wwwh) & ~RW_HAS_WAITERS) != RW_READ_LOCK)
			return (0);
		/* Swap our single read hold for the owner word. */
		new = old + RW_WRITE_LOCK(curthread) - RW_READ_LOCK;
	} while (casip(&lp->rw_wwwh, old, new) != old);

	membar_enter();
	/* Writers don't hold kpri; drop the one we took as a reader. */
	THREAD_KPRI_RELEASE();
	LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, lp);
	ASSERT(rw_locked(lp, RW_WRITER));
	return (1);
}

/*
 * Returns non-zero if the lock is held by any reader.
 */
int
rw_read_held(krwlock_t *rwlp)
{
	uintptr_t tmp;

	return (_RW_READ_HELD(rwlp, tmp));
}

/*
 * Returns non-zero if the lock is write-held by the current thread.
 */
int
rw_write_held(krwlock_t *rwlp)
{
	return (_RW_WRITE_HELD(rwlp));
}

/*
 * Returns non-zero if the lock is held in either mode.
 */
int
rw_lock_held(krwlock_t *rwlp)
{
	return (_RW_LOCK_HELD(rwlp));
}

/*
 * Like rw_read_held(), but ASSERTs that the lock is currently held
 */
int
rw_read_locked(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	ASSERT(old & RW_LOCKED);
	return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));
}

/*
 * Returns non-zero if the lock is either held or desired by a writer
 */
int
rw_iswriter(krwlock_t *rwlp)
{
	return (_RW_ISWRITER(rwlp));
}

/*
 * Return the owning thread of a write-locked rwlock, or NULL if the lock
 * is read-held or unheld (readers are anonymous -- only a hold count is
 * kept).  Exported via rw_sobj_ops for turnstile priority inheritance.
 */
kthread_t *
rw_owner(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	return ((old & RW_WRITE_LOCKED) ? (kthread_t *)(old & RW_OWNER) : NULL);
}