17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*374ae87fSsvemuri * Common Development and Distribution License (the "License"). 6*374ae87fSsvemuri * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 217c478bd9Sstevel@tonic-gate /* 22*374ae87fSsvemuri * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 237c478bd9Sstevel@tonic-gate * Use is subject to license terms. 247c478bd9Sstevel@tonic-gate */ 257c478bd9Sstevel@tonic-gate 267c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 277c478bd9Sstevel@tonic-gate 287c478bd9Sstevel@tonic-gate #include <sys/param.h> 297c478bd9Sstevel@tonic-gate #include <sys/thread.h> 307c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 317c478bd9Sstevel@tonic-gate #include <sys/debug.h> 327c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 337c478bd9Sstevel@tonic-gate #include <sys/sobject.h> 347c478bd9Sstevel@tonic-gate #include <sys/turnstile.h> 357c478bd9Sstevel@tonic-gate #include <sys/rwlock.h> 367c478bd9Sstevel@tonic-gate #include <sys/rwlock_impl.h> 377c478bd9Sstevel@tonic-gate #include <sys/atomic.h> 387c478bd9Sstevel@tonic-gate #include <sys/lockstat.h> 397c478bd9Sstevel@tonic-gate 407c478bd9Sstevel@tonic-gate /* 417c478bd9Sstevel@tonic-gate * Big Theory Statement for readers/writer locking primitives. 427c478bd9Sstevel@tonic-gate * 437c478bd9Sstevel@tonic-gate * An rwlock provides exclusive access to a single thread ("writer") or 447c478bd9Sstevel@tonic-gate * concurrent access to multiple threads ("readers"). See rwlock(9F) 457c478bd9Sstevel@tonic-gate * for a full description of the interfaces and programming model. 467c478bd9Sstevel@tonic-gate * The rest of this comment describes the implementation. 477c478bd9Sstevel@tonic-gate * 487c478bd9Sstevel@tonic-gate * An rwlock is a single word with the following structure: 497c478bd9Sstevel@tonic-gate * 507c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------- 517c478bd9Sstevel@tonic-gate * | OWNER (writer) or HOLD COUNT (readers) | WRLOCK | WRWANT | WAIT | 527c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------- 537c478bd9Sstevel@tonic-gate * 63 / 31 .. 3 2 1 0 547c478bd9Sstevel@tonic-gate * 557c478bd9Sstevel@tonic-gate * The waiters bit (0) indicates whether any threads are blocked waiting 567c478bd9Sstevel@tonic-gate * for the lock. The write-wanted bit (1) indicates whether any threads 577c478bd9Sstevel@tonic-gate * are blocked waiting for write access. The write-locked bit (2) indicates 587c478bd9Sstevel@tonic-gate * whether the lock is held by a writer, which determines whether the upper 597c478bd9Sstevel@tonic-gate * bits (3..31 in ILP32, 3..63 in LP64) should be interpreted as the owner 607c478bd9Sstevel@tonic-gate * (thread pointer) or the hold count (number of readers). 617c478bd9Sstevel@tonic-gate * 627c478bd9Sstevel@tonic-gate * In the absence of any contention, a writer gets the lock by setting 637c478bd9Sstevel@tonic-gate * this word to (curthread | RW_WRITE_LOCKED); a reader gets the lock 647c478bd9Sstevel@tonic-gate * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK). 657c478bd9Sstevel@tonic-gate * 667c478bd9Sstevel@tonic-gate * A writer will fail to acquire the lock if any other thread owns it. 677c478bd9Sstevel@tonic-gate * A reader will fail if the lock is either owned or wanted by a writer. 687c478bd9Sstevel@tonic-gate * rw_tryenter() returns 0 in these cases; rw_enter() blocks until the 697c478bd9Sstevel@tonic-gate * lock becomes available. 707c478bd9Sstevel@tonic-gate * 717c478bd9Sstevel@tonic-gate * When a thread blocks it acquires the rwlock's hashed turnstile lock and 727c478bd9Sstevel@tonic-gate * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case) 737c478bd9Sstevel@tonic-gate * atomically *only if the lock still appears busy*. A thread must never 747c478bd9Sstevel@tonic-gate * accidentally block for an available lock since there would be no owner 757c478bd9Sstevel@tonic-gate * to awaken it. casip() provides the required atomicity. Once casip() 767c478bd9Sstevel@tonic-gate * succeeds, the decision to block becomes final and irreversible. The 777c478bd9Sstevel@tonic-gate * thread will not become runnable again until it has been granted ownership 787c478bd9Sstevel@tonic-gate * of the lock via direct handoff from a former owner as described below. 797c478bd9Sstevel@tonic-gate * 807c478bd9Sstevel@tonic-gate * In the absence of any waiters, rw_exit() just clears the lock (if it 817c478bd9Sstevel@tonic-gate * is write-locked) or decrements the hold count (if it is read-locked). 827c478bd9Sstevel@tonic-gate * Note that even if waiters are present, decrementing the hold count 837c478bd9Sstevel@tonic-gate * to a non-zero value requires no special action since the lock is still 847c478bd9Sstevel@tonic-gate * held by at least one other thread. 857c478bd9Sstevel@tonic-gate * 867c478bd9Sstevel@tonic-gate * On the "final exit" (transition to unheld state) of a lock with waiters, 877c478bd9Sstevel@tonic-gate * rw_exit_wakeup() grabs the turnstile lock and transfers ownership directly 887c478bd9Sstevel@tonic-gate * to the next writer or set of readers. There are several advantages to this 897c478bd9Sstevel@tonic-gate * approach: (1) it closes all windows for priority inversion (when a new 907c478bd9Sstevel@tonic-gate * writer has grabbed the lock but has not yet inherited from blocked readers); 917c478bd9Sstevel@tonic-gate * (2) it prevents starvation of equal-priority threads by granting the lock 927c478bd9Sstevel@tonic-gate * in FIFO order; (3) it eliminates the need for a write-wanted count -- a 937c478bd9Sstevel@tonic-gate * single bit suffices because the lock remains held until all waiting 947c478bd9Sstevel@tonic-gate * writers are gone; (4) when we awaken N readers we can perform a single 957c478bd9Sstevel@tonic-gate * "atomic_add(&x, N)" to set the total hold count rather than having all N 967c478bd9Sstevel@tonic-gate * threads fight for the cache to perform an "atomic_add(&x, 1)" upon wakeup. 977c478bd9Sstevel@tonic-gate * 987c478bd9Sstevel@tonic-gate * The most interesting policy decision in rw_exit_wakeup() is which thread 997c478bd9Sstevel@tonic-gate * to wake. Starvation is always possible with priority-based scheduling, 1007c478bd9Sstevel@tonic-gate * but any sane wakeup policy should at least satisfy these requirements: 1017c478bd9Sstevel@tonic-gate * 1027c478bd9Sstevel@tonic-gate * (1) The highest-priority thread in the system should not starve. 1037c478bd9Sstevel@tonic-gate * (2) The highest-priority writer should not starve. 1047c478bd9Sstevel@tonic-gate * (3) No writer should starve due to lower-priority threads. 1057c478bd9Sstevel@tonic-gate * (4) No reader should starve due to lower-priority writers. 1067c478bd9Sstevel@tonic-gate * (5) If all threads have equal priority, none of them should starve. 1077c478bd9Sstevel@tonic-gate * 1087c478bd9Sstevel@tonic-gate * We used to employ a writers-always-win policy, which doesn't even 1097c478bd9Sstevel@tonic-gate * satisfy (1): a steady stream of low-priority writers can starve out 1107c478bd9Sstevel@tonic-gate * a real-time reader! This is clearly a broken policy -- it violates 1117c478bd9Sstevel@tonic-gate * (1), (4), and (5) -- but it's how rwlocks always used to behave. 1127c478bd9Sstevel@tonic-gate * 1137c478bd9Sstevel@tonic-gate * A round-robin policy (exiting readers grant the lock to blocked writers 1147c478bd9Sstevel@tonic-gate * and vice versa) satisfies all but (3): a single high-priority writer 1157c478bd9Sstevel@tonic-gate * and many low-priority readers can starve out medium-priority writers. 1167c478bd9Sstevel@tonic-gate * 1177c478bd9Sstevel@tonic-gate * A strict priority policy (grant the lock to the highest priority blocked 1187c478bd9Sstevel@tonic-gate * thread) satisfies everything but (2): a steady stream of high-priority 1197c478bd9Sstevel@tonic-gate * readers can permanently starve the highest-priority writer. 1207c478bd9Sstevel@tonic-gate * 1217c478bd9Sstevel@tonic-gate * The reason we care about (2) is that it's important to process writers 1227c478bd9Sstevel@tonic-gate * reasonably quickly -- even if they're low priority -- because their very 1237c478bd9Sstevel@tonic-gate * presence causes all readers to take the slow (blocking) path through this 1247c478bd9Sstevel@tonic-gate * code. There is also a general sense that writers deserve some degree of 1257c478bd9Sstevel@tonic-gate * deference because they're updating the data upon which all readers act. 1267c478bd9Sstevel@tonic-gate * Presumably this data should not be allowed to become arbitrarily stale 1277c478bd9Sstevel@tonic-gate * due to writer starvation. Finally, it seems reasonable to level the 1287c478bd9Sstevel@tonic-gate * playing field a bit to compensate for the fact that it's so much harder 1297c478bd9Sstevel@tonic-gate * for a writer to get in when there are already many readers present. 1307c478bd9Sstevel@tonic-gate * 1317c478bd9Sstevel@tonic-gate * A hybrid of round-robin and strict priority can be made to satisfy 1327c478bd9Sstevel@tonic-gate * all five criteria. In this "writer priority policy" exiting readers 1337c478bd9Sstevel@tonic-gate * always grant the lock to waiting writers, but exiting writers only 1347c478bd9Sstevel@tonic-gate * grant the lock to readers of the same or higher priority than the 1357c478bd9Sstevel@tonic-gate * highest-priority blocked writer. Thus requirement (2) is satisfied, 1367c478bd9Sstevel@tonic-gate * necessarily, by a willful act of priority inversion: an exiting reader 1377c478bd9Sstevel@tonic-gate * will grant the lock to a blocked writer even if there are blocked 1387c478bd9Sstevel@tonic-gate * readers of higher priority. The situation is mitigated by the fact 1397c478bd9Sstevel@tonic-gate * that writers always inherit priority from blocked readers, and the 1407c478bd9Sstevel@tonic-gate * writer will awaken those readers as soon as it exits the lock. 1417c478bd9Sstevel@tonic-gate * 1427c478bd9Sstevel@tonic-gate * rw_downgrade() follows the same wakeup policy as an exiting writer. 1437c478bd9Sstevel@tonic-gate * 1447c478bd9Sstevel@tonic-gate * rw_tryupgrade() has the same failure mode as rw_tryenter() for a 1457c478bd9Sstevel@tonic-gate * write lock. Both honor the WRITE_WANTED bit by specification. 1467c478bd9Sstevel@tonic-gate * 1477c478bd9Sstevel@tonic-gate * The following rules apply to manipulation of rwlock internal state: 1487c478bd9Sstevel@tonic-gate * 1497c478bd9Sstevel@tonic-gate * (1) The rwlock is only modified via the atomic primitives casip() 1507c478bd9Sstevel@tonic-gate * and atomic_add_ip(). 1517c478bd9Sstevel@tonic-gate * 1527c478bd9Sstevel@tonic-gate * (2) The waiters bit and write-wanted bit are only modified under 1537c478bd9Sstevel@tonic-gate * turnstile_lookup(). This ensures that the turnstile is consistent 1547c478bd9Sstevel@tonic-gate * with the rwlock. 1557c478bd9Sstevel@tonic-gate * 1567c478bd9Sstevel@tonic-gate * (3) Waiters receive the lock by direct handoff from the previous 1577c478bd9Sstevel@tonic-gate * owner. Therefore, waiters *always* wake up holding the lock. 1587c478bd9Sstevel@tonic-gate */ 1597c478bd9Sstevel@tonic-gate 1607c478bd9Sstevel@tonic-gate /* 1617c478bd9Sstevel@tonic-gate * The sobj_ops vector exports a set of functions needed when a thread 1627c478bd9Sstevel@tonic-gate * is asleep on a synchronization object of a given type. 1637c478bd9Sstevel@tonic-gate */ 1647c478bd9Sstevel@tonic-gate static sobj_ops_t rw_sobj_ops = { 1657c478bd9Sstevel@tonic-gate SOBJ_RWLOCK, rw_owner, turnstile_stay_asleep, turnstile_change_pri 1667c478bd9Sstevel@tonic-gate }; 1677c478bd9Sstevel@tonic-gate 1687c478bd9Sstevel@tonic-gate /* 1697c478bd9Sstevel@tonic-gate * If the system panics on an rwlock, save the address of the offending 1707c478bd9Sstevel@tonic-gate * rwlock in panic_rwlock_addr, and save the contents in panic_rwlock. 1717c478bd9Sstevel@tonic-gate */ 1727c478bd9Sstevel@tonic-gate static rwlock_impl_t panic_rwlock; 1737c478bd9Sstevel@tonic-gate static rwlock_impl_t *panic_rwlock_addr; 1747c478bd9Sstevel@tonic-gate 1757c478bd9Sstevel@tonic-gate static void 1767c478bd9Sstevel@tonic-gate rw_panic(char *msg, rwlock_impl_t *lp) 1777c478bd9Sstevel@tonic-gate { 1787c478bd9Sstevel@tonic-gate if (panicstr) 1797c478bd9Sstevel@tonic-gate return; 1807c478bd9Sstevel@tonic-gate 1817c478bd9Sstevel@tonic-gate if (casptr(&panic_rwlock_addr, NULL, lp) == NULL) 1827c478bd9Sstevel@tonic-gate panic_rwlock = *lp; 1837c478bd9Sstevel@tonic-gate 1847c478bd9Sstevel@tonic-gate panic("%s, lp=%p wwwh=%lx thread=%p", 1857c478bd9Sstevel@tonic-gate msg, lp, panic_rwlock.rw_wwwh, curthread); 1867c478bd9Sstevel@tonic-gate } 1877c478bd9Sstevel@tonic-gate 1887c478bd9Sstevel@tonic-gate /* ARGSUSED */ 1897c478bd9Sstevel@tonic-gate void 1907c478bd9Sstevel@tonic-gate rw_init(krwlock_t *rwlp, char *name, krw_type_t type, void *arg) 1917c478bd9Sstevel@tonic-gate { 1927c478bd9Sstevel@tonic-gate ((rwlock_impl_t *)rwlp)->rw_wwwh = 0; 1937c478bd9Sstevel@tonic-gate } 1947c478bd9Sstevel@tonic-gate 1957c478bd9Sstevel@tonic-gate void 1967c478bd9Sstevel@tonic-gate rw_destroy(krwlock_t *rwlp) 1977c478bd9Sstevel@tonic-gate { 1987c478bd9Sstevel@tonic-gate rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 1997c478bd9Sstevel@tonic-gate 2007c478bd9Sstevel@tonic-gate if (lp->rw_wwwh != 0) { 2017c478bd9Sstevel@tonic-gate if ((lp->rw_wwwh & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) 2027c478bd9Sstevel@tonic-gate rw_panic("rw_destroy: lock already destroyed", lp); 2037c478bd9Sstevel@tonic-gate else 2047c478bd9Sstevel@tonic-gate rw_panic("rw_destroy: lock still active", lp); 2057c478bd9Sstevel@tonic-gate } 2067c478bd9Sstevel@tonic-gate 2077c478bd9Sstevel@tonic-gate lp->rw_wwwh = RW_DOUBLE_LOCK; 2087c478bd9Sstevel@tonic-gate } 2097c478bd9Sstevel@tonic-gate 2107c478bd9Sstevel@tonic-gate /* 2117c478bd9Sstevel@tonic-gate * Verify that an rwlock is held correctly. 2127c478bd9Sstevel@tonic-gate */ 2137c478bd9Sstevel@tonic-gate static int 2147c478bd9Sstevel@tonic-gate rw_locked(rwlock_impl_t *lp, krw_t rw) 2157c478bd9Sstevel@tonic-gate { 2167c478bd9Sstevel@tonic-gate uintptr_t old = lp->rw_wwwh; 2177c478bd9Sstevel@tonic-gate 2187c478bd9Sstevel@tonic-gate if (rw == RW_READER) 2197c478bd9Sstevel@tonic-gate return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED)); 2207c478bd9Sstevel@tonic-gate 2217c478bd9Sstevel@tonic-gate if (rw == RW_WRITER) 2227c478bd9Sstevel@tonic-gate return ((old & RW_OWNER) == (uintptr_t)curthread); 2237c478bd9Sstevel@tonic-gate 2247c478bd9Sstevel@tonic-gate return (0); 2257c478bd9Sstevel@tonic-gate } 2267c478bd9Sstevel@tonic-gate 227*374ae87fSsvemuri uint_t (*rw_lock_backoff)(uint_t) = NULL; 228*374ae87fSsvemuri void (*rw_lock_delay)(uint_t) = NULL; 229*374ae87fSsvemuri 2307c478bd9Sstevel@tonic-gate /* 2317c478bd9Sstevel@tonic-gate * Full-service implementation of rw_enter() to handle all the hard cases. 2327c478bd9Sstevel@tonic-gate * Called from the assembly version if anything complicated is going on. 2337c478bd9Sstevel@tonic-gate * The only semantic difference between calling rw_enter() and calling 2347c478bd9Sstevel@tonic-gate * rw_enter_sleep() directly is that we assume the caller has already done 2357c478bd9Sstevel@tonic-gate * a THREAD_KPRI_REQUEST() in the RW_READER case. 2367c478bd9Sstevel@tonic-gate */ 2377c478bd9Sstevel@tonic-gate void 2387c478bd9Sstevel@tonic-gate rw_enter_sleep(rwlock_impl_t *lp, krw_t rw) 2397c478bd9Sstevel@tonic-gate { 2407c478bd9Sstevel@tonic-gate uintptr_t old, new, lock_value, lock_busy, lock_wait; 2417c478bd9Sstevel@tonic-gate hrtime_t sleep_time; 2427c478bd9Sstevel@tonic-gate turnstile_t *ts; 243*374ae87fSsvemuri uint_t backoff = 0; 244*374ae87fSsvemuri int loop_count = 0; 2457c478bd9Sstevel@tonic-gate 2467c478bd9Sstevel@tonic-gate if (rw == RW_READER) { 2477c478bd9Sstevel@tonic-gate lock_value = RW_READ_LOCK; 2487c478bd9Sstevel@tonic-gate lock_busy = RW_WRITE_CLAIMED; 2497c478bd9Sstevel@tonic-gate lock_wait = RW_HAS_WAITERS; 2507c478bd9Sstevel@tonic-gate } else { 2517c478bd9Sstevel@tonic-gate lock_value = RW_WRITE_LOCK(curthread); 2527c478bd9Sstevel@tonic-gate lock_busy = (uintptr_t)RW_LOCKED; 2537c478bd9Sstevel@tonic-gate lock_wait = RW_HAS_WAITERS | RW_WRITE_WANTED; 2547c478bd9Sstevel@tonic-gate } 2557c478bd9Sstevel@tonic-gate 2567c478bd9Sstevel@tonic-gate for (;;) { 2577c478bd9Sstevel@tonic-gate if (((old = lp->rw_wwwh) & lock_busy) == 0) { 258*374ae87fSsvemuri if (casip(&lp->rw_wwwh, old, old + lock_value) != old) { 259*374ae87fSsvemuri if (rw_lock_delay != NULL) { 260*374ae87fSsvemuri backoff = rw_lock_backoff(backoff); 261*374ae87fSsvemuri rw_lock_delay(backoff); 262*374ae87fSsvemuri if (++loop_count == ncpus_online) { 263*374ae87fSsvemuri backoff = 0; 264*374ae87fSsvemuri loop_count = 0; 265*374ae87fSsvemuri } 266*374ae87fSsvemuri } 2677c478bd9Sstevel@tonic-gate continue; 268*374ae87fSsvemuri } 2697c478bd9Sstevel@tonic-gate break; 2707c478bd9Sstevel@tonic-gate } 2717c478bd9Sstevel@tonic-gate 2727c478bd9Sstevel@tonic-gate if (panicstr) 2737c478bd9Sstevel@tonic-gate return; 2747c478bd9Sstevel@tonic-gate 2757c478bd9Sstevel@tonic-gate if ((old & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) { 2767c478bd9Sstevel@tonic-gate rw_panic("rw_enter: bad rwlock", lp); 2777c478bd9Sstevel@tonic-gate return; 2787c478bd9Sstevel@tonic-gate } 2797c478bd9Sstevel@tonic-gate 2807c478bd9Sstevel@tonic-gate if ((old & RW_OWNER) == (uintptr_t)curthread) { 2817c478bd9Sstevel@tonic-gate rw_panic("recursive rw_enter", lp); 2827c478bd9Sstevel@tonic-gate return; 2837c478bd9Sstevel@tonic-gate } 2847c478bd9Sstevel@tonic-gate 2857c478bd9Sstevel@tonic-gate ts = turnstile_lookup(lp); 2867c478bd9Sstevel@tonic-gate 2877c478bd9Sstevel@tonic-gate do { 2887c478bd9Sstevel@tonic-gate if (((old = lp->rw_wwwh) & lock_busy) == 0) 2897c478bd9Sstevel@tonic-gate break; 2907c478bd9Sstevel@tonic-gate new = old | lock_wait; 2917c478bd9Sstevel@tonic-gate } while (old != new && casip(&lp->rw_wwwh, old, new) != old); 2927c478bd9Sstevel@tonic-gate 2937c478bd9Sstevel@tonic-gate if ((old & lock_busy) == 0) { 2947c478bd9Sstevel@tonic-gate /* 2957c478bd9Sstevel@tonic-gate * The lock appears free now; try the dance again 2967c478bd9Sstevel@tonic-gate */ 2977c478bd9Sstevel@tonic-gate turnstile_exit(lp); 2987c478bd9Sstevel@tonic-gate continue; 2997c478bd9Sstevel@tonic-gate } 3007c478bd9Sstevel@tonic-gate 3017c478bd9Sstevel@tonic-gate /* 3027c478bd9Sstevel@tonic-gate * We really are going to block. Bump the stats, and drop 3037c478bd9Sstevel@tonic-gate * kpri if we're a reader. 3047c478bd9Sstevel@tonic-gate */ 3057c478bd9Sstevel@tonic-gate ASSERT(lp->rw_wwwh & lock_wait); 3067c478bd9Sstevel@tonic-gate ASSERT(lp->rw_wwwh & RW_LOCKED); 3077c478bd9Sstevel@tonic-gate 3087c478bd9Sstevel@tonic-gate sleep_time = -gethrtime(); 3097c478bd9Sstevel@tonic-gate if (rw == RW_READER) { 3107c478bd9Sstevel@tonic-gate THREAD_KPRI_RELEASE(); 3117c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1); 3127c478bd9Sstevel@tonic-gate (void) turnstile_block(ts, TS_READER_Q, lp, 3137c478bd9Sstevel@tonic-gate &rw_sobj_ops, NULL, NULL); 3147c478bd9Sstevel@tonic-gate } else { 3157c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(CPU, sys, rw_wrfails, 1); 3167c478bd9Sstevel@tonic-gate (void) turnstile_block(ts, TS_WRITER_Q, lp, 3177c478bd9Sstevel@tonic-gate &rw_sobj_ops, NULL, NULL); 3187c478bd9Sstevel@tonic-gate } 3197c478bd9Sstevel@tonic-gate sleep_time += gethrtime(); 3207c478bd9Sstevel@tonic-gate 3217c478bd9Sstevel@tonic-gate LOCKSTAT_RECORD4(LS_RW_ENTER_BLOCK, lp, sleep_time, rw, 3227c478bd9Sstevel@tonic-gate (old & RW_WRITE_LOCKED) ? 1 : 0, 3237c478bd9Sstevel@tonic-gate old >> RW_HOLD_COUNT_SHIFT); 3247c478bd9Sstevel@tonic-gate 3257c478bd9Sstevel@tonic-gate /* 3267c478bd9Sstevel@tonic-gate * We wake up holding the lock (and having kpri if we're 3277c478bd9Sstevel@tonic-gate * a reader) via direct handoff from the previous owner. 3287c478bd9Sstevel@tonic-gate */ 3297c478bd9Sstevel@tonic-gate break; 3307c478bd9Sstevel@tonic-gate } 3317c478bd9Sstevel@tonic-gate 3327c478bd9Sstevel@tonic-gate ASSERT(rw_locked(lp, rw)); 3337c478bd9Sstevel@tonic-gate 3347c478bd9Sstevel@tonic-gate membar_enter(); 3357c478bd9Sstevel@tonic-gate 3367c478bd9Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_ENTER_ACQUIRE, lp, rw); 3377c478bd9Sstevel@tonic-gate } 3387c478bd9Sstevel@tonic-gate 3397c478bd9Sstevel@tonic-gate /* 3407c478bd9Sstevel@tonic-gate * Return the number of readers to wake, or zero if we should wake a writer. 3417c478bd9Sstevel@tonic-gate * Called only by exiting/downgrading writers (readers don't wake readers). 3427c478bd9Sstevel@tonic-gate */ 3437c478bd9Sstevel@tonic-gate static int 3447c478bd9Sstevel@tonic-gate rw_readers_to_wake(turnstile_t *ts) 3457c478bd9Sstevel@tonic-gate { 3467c478bd9Sstevel@tonic-gate kthread_t *next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first; 3477c478bd9Sstevel@tonic-gate kthread_t *next_reader = ts->ts_sleepq[TS_READER_Q].sq_first; 3487c478bd9Sstevel@tonic-gate pri_t wpri = (next_writer != NULL) ? DISP_PRIO(next_writer) : -1; 3497c478bd9Sstevel@tonic-gate int count = 0; 3507c478bd9Sstevel@tonic-gate 3517c478bd9Sstevel@tonic-gate while (next_reader != NULL) { 3527c478bd9Sstevel@tonic-gate if (DISP_PRIO(next_reader) < wpri) 3537c478bd9Sstevel@tonic-gate break; 3547c478bd9Sstevel@tonic-gate next_reader->t_kpri_req++; 3557c478bd9Sstevel@tonic-gate next_reader = next_reader->t_link; 3567c478bd9Sstevel@tonic-gate count++; 3577c478bd9Sstevel@tonic-gate } 3587c478bd9Sstevel@tonic-gate return (count); 3597c478bd9Sstevel@tonic-gate } 3607c478bd9Sstevel@tonic-gate 3617c478bd9Sstevel@tonic-gate /* 3627c478bd9Sstevel@tonic-gate * Full-service implementation of rw_exit() to handle all the hard cases. 3637c478bd9Sstevel@tonic-gate * Called from the assembly version if anything complicated is going on. 3647c478bd9Sstevel@tonic-gate * There is no semantic difference between calling rw_exit() and calling 3657c478bd9Sstevel@tonic-gate * rw_exit_wakeup() directly. 3667c478bd9Sstevel@tonic-gate */ 3677c478bd9Sstevel@tonic-gate void 3687c478bd9Sstevel@tonic-gate rw_exit_wakeup(rwlock_impl_t *lp) 3697c478bd9Sstevel@tonic-gate { 3707c478bd9Sstevel@tonic-gate turnstile_t *ts; 3717c478bd9Sstevel@tonic-gate uintptr_t old, new, lock_value; 3727c478bd9Sstevel@tonic-gate kthread_t *next_writer; 3737c478bd9Sstevel@tonic-gate int nreaders; 374*374ae87fSsvemuri uint_t backoff = 0; 375*374ae87fSsvemuri int loop_count = 0; 3767c478bd9Sstevel@tonic-gate 3777c478bd9Sstevel@tonic-gate membar_exit(); 3787c478bd9Sstevel@tonic-gate 3797c478bd9Sstevel@tonic-gate old = lp->rw_wwwh; 3807c478bd9Sstevel@tonic-gate if (old & RW_WRITE_LOCKED) { 3817c478bd9Sstevel@tonic-gate if ((old & RW_OWNER) != (uintptr_t)curthread) { 3827c478bd9Sstevel@tonic-gate rw_panic("rw_exit: not owner", lp); 3837c478bd9Sstevel@tonic-gate lp->rw_wwwh = 0; 3847c478bd9Sstevel@tonic-gate return; 3857c478bd9Sstevel@tonic-gate } 3867c478bd9Sstevel@tonic-gate lock_value = RW_WRITE_LOCK(curthread); 3877c478bd9Sstevel@tonic-gate } else { 3887c478bd9Sstevel@tonic-gate if ((old & RW_LOCKED) == 0) { 3897c478bd9Sstevel@tonic-gate rw_panic("rw_exit: lock not held", lp); 3907c478bd9Sstevel@tonic-gate return; 3917c478bd9Sstevel@tonic-gate } 3927c478bd9Sstevel@tonic-gate lock_value = RW_READ_LOCK; 3937c478bd9Sstevel@tonic-gate } 3947c478bd9Sstevel@tonic-gate 3957c478bd9Sstevel@tonic-gate for (;;) { 3967c478bd9Sstevel@tonic-gate /* 3977c478bd9Sstevel@tonic-gate * If this is *not* the final exit of a lock with waiters, 3987c478bd9Sstevel@tonic-gate * just drop the lock -- there's nothing tricky going on. 3997c478bd9Sstevel@tonic-gate */ 4007c478bd9Sstevel@tonic-gate old = lp->rw_wwwh; 4017c478bd9Sstevel@tonic-gate new = old - lock_value; 4027c478bd9Sstevel@tonic-gate if ((new & (RW_LOCKED | RW_HAS_WAITERS)) != RW_HAS_WAITERS) { 403*374ae87fSsvemuri if (casip(&lp->rw_wwwh, old, new) != old) { 404*374ae87fSsvemuri if (rw_lock_delay != NULL) { 405*374ae87fSsvemuri backoff = rw_lock_backoff(backoff); 406*374ae87fSsvemuri rw_lock_delay(backoff); 407*374ae87fSsvemuri if (++loop_count == ncpus_online) { 408*374ae87fSsvemuri backoff = 0; 409*374ae87fSsvemuri loop_count = 0; 410*374ae87fSsvemuri } 411*374ae87fSsvemuri } 4127c478bd9Sstevel@tonic-gate continue; 413*374ae87fSsvemuri } 4147c478bd9Sstevel@tonic-gate break; 4157c478bd9Sstevel@tonic-gate } 4167c478bd9Sstevel@tonic-gate 4177c478bd9Sstevel@tonic-gate /* 4187c478bd9Sstevel@tonic-gate * Perform the final exit of a lock that has waiters. 4197c478bd9Sstevel@tonic-gate */ 4207c478bd9Sstevel@tonic-gate ts = turnstile_lookup(lp); 4217c478bd9Sstevel@tonic-gate 4227c478bd9Sstevel@tonic-gate next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first; 4237c478bd9Sstevel@tonic-gate 4247c478bd9Sstevel@tonic-gate if ((old & RW_WRITE_LOCKED) && 4257c478bd9Sstevel@tonic-gate (nreaders = rw_readers_to_wake(ts)) > 0) { 4267c478bd9Sstevel@tonic-gate /* 4277c478bd9Sstevel@tonic-gate * Don't drop the lock -- just set the hold count 4287c478bd9Sstevel@tonic-gate * such that we grant the lock to all readers at once. 4297c478bd9Sstevel@tonic-gate */ 4307c478bd9Sstevel@tonic-gate new = nreaders * RW_READ_LOCK; 4317c478bd9Sstevel@tonic-gate if (ts->ts_waiters > nreaders) 4327c478bd9Sstevel@tonic-gate new |= RW_HAS_WAITERS; 4337c478bd9Sstevel@tonic-gate if (next_writer) 4347c478bd9Sstevel@tonic-gate new |= RW_WRITE_WANTED; 4357c478bd9Sstevel@tonic-gate lp->rw_wwwh = new; 4367c478bd9Sstevel@tonic-gate membar_enter(); 4377c478bd9Sstevel@tonic-gate turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL); 4387c478bd9Sstevel@tonic-gate } else { 4397c478bd9Sstevel@tonic-gate /* 4407c478bd9Sstevel@tonic-gate * Don't drop the lock -- just transfer ownership 4417c478bd9Sstevel@tonic-gate * directly to next_writer. Note that there must 4427c478bd9Sstevel@tonic-gate * be at least one waiting writer, because we get 4437c478bd9Sstevel@tonic-gate * here only if (A) the lock is read-locked or 4447c478bd9Sstevel@tonic-gate * (B) there are no waiting readers. In case (A), 4457c478bd9Sstevel@tonic-gate * since the lock is read-locked there would be no 4467c478bd9Sstevel@tonic-gate * reason for other readers to have blocked unless 4477c478bd9Sstevel@tonic-gate * the RW_WRITE_WANTED bit was set. In case (B), 4487c478bd9Sstevel@tonic-gate * since there are waiters but no waiting readers, 4497c478bd9Sstevel@tonic-gate * they must all be waiting writers. 4507c478bd9Sstevel@tonic-gate */ 4517c478bd9Sstevel@tonic-gate ASSERT(lp->rw_wwwh & RW_WRITE_WANTED); 4527c478bd9Sstevel@tonic-gate new = RW_WRITE_LOCK(next_writer); 4537c478bd9Sstevel@tonic-gate if (ts->ts_waiters > 1) 4547c478bd9Sstevel@tonic-gate new |= RW_HAS_WAITERS; 4557c478bd9Sstevel@tonic-gate if (next_writer->t_link) 4567c478bd9Sstevel@tonic-gate new |= RW_WRITE_WANTED; 4577c478bd9Sstevel@tonic-gate lp->rw_wwwh = new; 4587c478bd9Sstevel@tonic-gate membar_enter(); 4597c478bd9Sstevel@tonic-gate turnstile_wakeup(ts, TS_WRITER_Q, 1, next_writer); 4607c478bd9Sstevel@tonic-gate } 4617c478bd9Sstevel@tonic-gate break; 4627c478bd9Sstevel@tonic-gate } 4637c478bd9Sstevel@tonic-gate 4647c478bd9Sstevel@tonic-gate if (lock_value == RW_READ_LOCK) { 4657c478bd9Sstevel@tonic-gate THREAD_KPRI_RELEASE(); 4667c478bd9Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_READER); 4677c478bd9Sstevel@tonic-gate } else { 4687c478bd9Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_WRITER); 4697c478bd9Sstevel@tonic-gate } 4707c478bd9Sstevel@tonic-gate } 4717c478bd9Sstevel@tonic-gate 4727c478bd9Sstevel@tonic-gate int 4737c478bd9Sstevel@tonic-gate rw_tryenter(krwlock_t *rwlp, krw_t rw) 4747c478bd9Sstevel@tonic-gate { 4757c478bd9Sstevel@tonic-gate rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 4767c478bd9Sstevel@tonic-gate uintptr_t old; 4777c478bd9Sstevel@tonic-gate 4787c478bd9Sstevel@tonic-gate if (rw == RW_READER) { 479*374ae87fSsvemuri uint_t backoff = 0; 480*374ae87fSsvemuri int loop_count = 0; 4817c478bd9Sstevel@tonic-gate THREAD_KPRI_REQUEST(); 482*374ae87fSsvemuri for (;;) { 4837c478bd9Sstevel@tonic-gate if ((old = lp->rw_wwwh) & RW_WRITE_CLAIMED) { 4847c478bd9Sstevel@tonic-gate THREAD_KPRI_RELEASE(); 4857c478bd9Sstevel@tonic-gate return (0); 4867c478bd9Sstevel@tonic-gate } 487*374ae87fSsvemuri if (casip(&lp->rw_wwwh, old, old + RW_READ_LOCK) == old) 488*374ae87fSsvemuri break; 489*374ae87fSsvemuri if (rw_lock_delay != NULL) { 490*374ae87fSsvemuri backoff = rw_lock_backoff(backoff); 491*374ae87fSsvemuri rw_lock_delay(backoff); 492*374ae87fSsvemuri if (++loop_count == ncpus_online) { 493*374ae87fSsvemuri backoff = 0; 494*374ae87fSsvemuri loop_count = 0; 495*374ae87fSsvemuri } 496*374ae87fSsvemuri } 497*374ae87fSsvemuri } 4987c478bd9Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw); 4997c478bd9Sstevel@tonic-gate } else { 5007c478bd9Sstevel@tonic-gate if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) != 0) 5017c478bd9Sstevel@tonic-gate return (0); 5027c478bd9Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw); 5037c478bd9Sstevel@tonic-gate } 5047c478bd9Sstevel@tonic-gate ASSERT(rw_locked(lp, rw)); 5057c478bd9Sstevel@tonic-gate membar_enter(); 5067c478bd9Sstevel@tonic-gate return (1); 5077c478bd9Sstevel@tonic-gate } 5087c478bd9Sstevel@tonic-gate 5097c478bd9Sstevel@tonic-gate void 5107c478bd9Sstevel@tonic-gate rw_downgrade(krwlock_t *rwlp) 5117c478bd9Sstevel@tonic-gate { 5127c478bd9Sstevel@tonic-gate rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 5137c478bd9Sstevel@tonic-gate 5147c478bd9Sstevel@tonic-gate THREAD_KPRI_REQUEST(); 5157c478bd9Sstevel@tonic-gate membar_exit(); 5167c478bd9Sstevel@tonic-gate 5177c478bd9Sstevel@tonic-gate if ((lp->rw_wwwh & RW_OWNER) != (uintptr_t)curthread) { 5187c478bd9Sstevel@tonic-gate rw_panic("rw_downgrade: not owner", lp); 5197c478bd9Sstevel@tonic-gate return; 5207c478bd9Sstevel@tonic-gate } 5217c478bd9Sstevel@tonic-gate 5227c478bd9Sstevel@tonic-gate if (atomic_add_ip_nv(&lp->rw_wwwh, 5237c478bd9Sstevel@tonic-gate RW_READ_LOCK - RW_WRITE_LOCK(curthread)) & RW_HAS_WAITERS) { 5247c478bd9Sstevel@tonic-gate turnstile_t *ts = turnstile_lookup(lp); 5257c478bd9Sstevel@tonic-gate int nreaders = rw_readers_to_wake(ts); 5267c478bd9Sstevel@tonic-gate if (nreaders > 0) { 5277c478bd9Sstevel@tonic-gate uintptr_t delta = nreaders * RW_READ_LOCK; 5287c478bd9Sstevel@tonic-gate if (ts->ts_waiters == nreaders) 5297c478bd9Sstevel@tonic-gate delta -= RW_HAS_WAITERS; 5307c478bd9Sstevel@tonic-gate atomic_add_ip(&lp->rw_wwwh, delta); 5317c478bd9Sstevel@tonic-gate } 5327c478bd9Sstevel@tonic-gate turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL); 5337c478bd9Sstevel@tonic-gate } 5347c478bd9Sstevel@tonic-gate ASSERT(rw_locked(lp, RW_READER)); 5357c478bd9Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, lp); 5367c478bd9Sstevel@tonic-gate } 5377c478bd9Sstevel@tonic-gate 5387c478bd9Sstevel@tonic-gate int 5397c478bd9Sstevel@tonic-gate rw_tryupgrade(krwlock_t *rwlp) 5407c478bd9Sstevel@tonic-gate { 5417c478bd9Sstevel@tonic-gate rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 5427c478bd9Sstevel@tonic-gate uintptr_t old, new; 5437c478bd9Sstevel@tonic-gate 5447c478bd9Sstevel@tonic-gate ASSERT(rw_locked(lp, RW_READER)); 5457c478bd9Sstevel@tonic-gate 5467c478bd9Sstevel@tonic-gate do { 5477c478bd9Sstevel@tonic-gate if (((old = lp->rw_wwwh) & ~RW_HAS_WAITERS) != RW_READ_LOCK) 5487c478bd9Sstevel@tonic-gate return (0); 5497c478bd9Sstevel@tonic-gate new = old + RW_WRITE_LOCK(curthread) - RW_READ_LOCK; 5507c478bd9Sstevel@tonic-gate } while (casip(&lp->rw_wwwh, old, new) != old); 5517c478bd9Sstevel@tonic-gate 5527c478bd9Sstevel@tonic-gate membar_enter(); 5537c478bd9Sstevel@tonic-gate THREAD_KPRI_RELEASE(); 5547c478bd9Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, lp); 5557c478bd9Sstevel@tonic-gate ASSERT(rw_locked(lp, RW_WRITER)); 5567c478bd9Sstevel@tonic-gate return (1); 5577c478bd9Sstevel@tonic-gate } 5587c478bd9Sstevel@tonic-gate 5597c478bd9Sstevel@tonic-gate int 5607c478bd9Sstevel@tonic-gate rw_read_held(krwlock_t *rwlp) 5617c478bd9Sstevel@tonic-gate { 5627c478bd9Sstevel@tonic-gate uintptr_t tmp; 5637c478bd9Sstevel@tonic-gate 5647c478bd9Sstevel@tonic-gate return (_RW_READ_HELD(rwlp, tmp)); 5657c478bd9Sstevel@tonic-gate } 5667c478bd9Sstevel@tonic-gate 5677c478bd9Sstevel@tonic-gate int 5687c478bd9Sstevel@tonic-gate rw_write_held(krwlock_t *rwlp) 5697c478bd9Sstevel@tonic-gate { 5707c478bd9Sstevel@tonic-gate return (_RW_WRITE_HELD(rwlp)); 5717c478bd9Sstevel@tonic-gate } 5727c478bd9Sstevel@tonic-gate 5737c478bd9Sstevel@tonic-gate int 5747c478bd9Sstevel@tonic-gate rw_lock_held(krwlock_t *rwlp) 5757c478bd9Sstevel@tonic-gate { 5767c478bd9Sstevel@tonic-gate return (_RW_LOCK_HELD(rwlp)); 5777c478bd9Sstevel@tonic-gate } 5787c478bd9Sstevel@tonic-gate 5797c478bd9Sstevel@tonic-gate /* 5807c478bd9Sstevel@tonic-gate * Like rw_read_held(), but ASSERTs that the lock is currently held 5817c478bd9Sstevel@tonic-gate */ 5827c478bd9Sstevel@tonic-gate int 5837c478bd9Sstevel@tonic-gate rw_read_locked(krwlock_t *rwlp) 5847c478bd9Sstevel@tonic-gate { 5857c478bd9Sstevel@tonic-gate uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh; 5867c478bd9Sstevel@tonic-gate 5877c478bd9Sstevel@tonic-gate ASSERT(old & RW_LOCKED); 5887c478bd9Sstevel@tonic-gate return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED)); 5897c478bd9Sstevel@tonic-gate } 5907c478bd9Sstevel@tonic-gate 5917c478bd9Sstevel@tonic-gate /* 5927c478bd9Sstevel@tonic-gate * Returns non-zero if the lock is either held or desired by a writer 5937c478bd9Sstevel@tonic-gate */ 5947c478bd9Sstevel@tonic-gate int 5957c478bd9Sstevel@tonic-gate rw_iswriter(krwlock_t *rwlp) 5967c478bd9Sstevel@tonic-gate { 5977c478bd9Sstevel@tonic-gate return (_RW_ISWRITER(rwlp)); 5987c478bd9Sstevel@tonic-gate } 5997c478bd9Sstevel@tonic-gate 6007c478bd9Sstevel@tonic-gate kthread_t * 6017c478bd9Sstevel@tonic-gate rw_owner(krwlock_t *rwlp) 6027c478bd9Sstevel@tonic-gate { 6037c478bd9Sstevel@tonic-gate uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh; 6047c478bd9Sstevel@tonic-gate 6057c478bd9Sstevel@tonic-gate return ((old & RW_WRITE_LOCKED) ? (kthread_t *)(old & RW_OWNER) : NULL); 6067c478bd9Sstevel@tonic-gate } 607