/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/turnstile.h>
#include <sys/rwlock.h>
#include <sys/rwlock_impl.h>
#include <sys/atomic.h>
#include <sys/lockstat.h>

/*
 * Big Theory Statement for readers/writer locking primitives.
 *
 * An rwlock provides exclusive access to a single thread ("writer") or
 * concurrent access to multiple threads ("readers").  See rwlock(9F)
 * for a full description of the interfaces and programming model.
 * The rest of this comment describes the implementation.
 *
 * An rwlock is a single word with the following structure:
 *
 *	---------------------------------------------------------------------
 *	| OWNER (writer) or HOLD COUNT (readers)   | WRLOCK | WRWANT | WAIT |
 *	---------------------------------------------------------------------
 *			63 / 31 .. 3			2	1	0
 *
 * The waiters bit (0) indicates whether any threads are blocked waiting
 * for the lock.  The write-wanted bit (1) indicates whether any threads
 * are blocked waiting for write access.
 * The write-locked bit (2) indicates whether the lock is held by a writer,
 * which determines whether the upper bits (3..31 in ILP32, 3..63 in LP64)
 * should be interpreted as the owner (thread pointer) or the hold count
 * (number of readers).
 *
 * In the absence of any contention, a writer gets the lock by setting
 * this word to (curthread | RW_WRITE_LOCKED); a reader gets the lock
 * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK).
 *
 * A writer will fail to acquire the lock if any other thread owns it.
 * A reader will fail if the lock is either owned or wanted by a writer.
 * rw_tryenter() returns 0 in these cases; rw_enter() blocks until the
 * lock becomes available.
 *
 * When a thread blocks it acquires the rwlock's hashed turnstile lock and
 * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case)
 * atomically *only if the lock still appears busy*.  A thread must never
 * accidentally block for an available lock since there would be no owner
 * to awaken it.  casip() provides the required atomicity.  Once casip()
 * succeeds, the decision to block becomes final and irreversible.  The
 * thread will not become runnable again until it has been granted ownership
 * of the lock via direct handoff from a former owner as described below.
 *
 * In the absence of any waiters, rw_exit() just clears the lock (if it
 * is write-locked) or decrements the hold count (if it is read-locked).
 * Note that even if waiters are present, decrementing the hold count
 * to a non-zero value requires no special action since the lock is still
 * held by at least one other thread.
 *
 * On the "final exit" (transition to unheld state) of a lock with waiters,
 * rw_exit_wakeup() grabs the turnstile lock and transfers ownership directly
 * to the next writer or set of readers.
 * There are several advantages to this
 * approach: (1) it closes all windows for priority inversion (when a new
 * writer has grabbed the lock but has not yet inherited from blocked readers);
 * (2) it prevents starvation of equal-priority threads by granting the lock
 * in FIFO order; (3) it eliminates the need for a write-wanted count -- a
 * single bit suffices because the lock remains held until all waiting
 * writers are gone; (4) when we awaken N readers we can perform a single
 * "atomic_add(&x, N)" to set the total hold count rather than having all N
 * threads fight for the cache to perform an "atomic_add(&x, 1)" upon wakeup.
 *
 * The most interesting policy decision in rw_exit_wakeup() is which thread
 * to wake.  Starvation is always possible with priority-based scheduling,
 * but any sane wakeup policy should at least satisfy these requirements:
 *
 *	(1) The highest-priority thread in the system should not starve.
 *	(2) The highest-priority writer should not starve.
 *	(3) No writer should starve due to lower-priority threads.
 *	(4) No reader should starve due to lower-priority writers.
 *	(5) If all threads have equal priority, none of them should starve.
 *
 * We used to employ a writers-always-win policy, which doesn't even
 * satisfy (1): a steady stream of low-priority writers can starve out
 * a real-time reader!  This is clearly a broken policy -- it violates
 * (1), (4), and (5) -- but it's how rwlocks always used to behave.
 *
 * A round-robin policy (exiting readers grant the lock to blocked writers
 * and vice versa) satisfies all but (3): a single high-priority writer
 * and many low-priority readers can starve out medium-priority writers.
 *
 * A strict priority policy (grant the lock to the highest priority blocked
 * thread) satisfies everything but (2): a steady stream of high-priority
 * readers can permanently starve the highest-priority writer.
 *
 * The reason we care about (2) is that it's important to process writers
 * reasonably quickly -- even if they're low priority -- because their very
 * presence causes all readers to take the slow (blocking) path through this
 * code.  There is also a general sense that writers deserve some degree of
 * deference because they're updating the data upon which all readers act.
 * Presumably this data should not be allowed to become arbitrarily stale
 * due to writer starvation.  Finally, it seems reasonable to level the
 * playing field a bit to compensate for the fact that it's so much harder
 * for a writer to get in when there are already many readers present.
 *
 * A hybrid of round-robin and strict priority can be made to satisfy
 * all five criteria.  In this "writer priority policy" exiting readers
 * always grant the lock to waiting writers, but exiting writers only
 * grant the lock to readers of the same or higher priority than the
 * highest-priority blocked writer.  Thus requirement (2) is satisfied,
 * necessarily, by a willful act of priority inversion: an exiting reader
 * will grant the lock to a blocked writer even if there are blocked
 * readers of higher priority.  The situation is mitigated by the fact
 * that writers always inherit priority from blocked readers, and the
 * writer will awaken those readers as soon as it exits the lock.
 *
 * rw_downgrade() follows the same wakeup policy as an exiting writer.
 *
 * rw_tryupgrade() has the same failure mode as rw_tryenter() for a
 * write lock.  Both honor the WRITE_WANTED bit by specification.
 *
 * The following rules apply to manipulation of rwlock internal state:
 *
 * (1) The rwlock is only modified via the atomic primitives casip()
 *     and atomic_add_ip().
 *
 * (2) The waiters bit and write-wanted bit are only modified under
 *     turnstile_lookup().  This ensures that the turnstile is consistent
 *     with the rwlock.
 *
 * (3) Waiters receive the lock by direct handoff from the previous
 *     owner.  Therefore, waiters *always* wake up holding the lock.
 */
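
/*
 * Illustrative sketch only (not part of the implementation): with the bit
 * layout above, RW_READ_LOCK == 8 and the WAIT/WRWANT/WRLOCK bits occupy
 * bits 0, 1 and 2, so the uncontended lock word evolves roughly like this:
 *
 *	0					unheld
 *	8					one reader (hold count 1)
 *	16					two readers (hold count 2)
 *	(uintptr_t)curthread | RW_WRITE_LOCKED	write-locked by curthread
 *
 * An uncontended write acquisition is therefore a single
 * casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)), and an uncontended
 * read acquisition simply adds RW_READ_LOCK to the word.
 */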

/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of a given type.
 */
static sobj_ops_t rw_sobj_ops = {
	SOBJ_RWLOCK, rw_owner, turnstile_stay_asleep, turnstile_change_pri
};

/*
 * If the system panics on an rwlock, save the address of the offending
 * rwlock in panic_rwlock_addr, and save the contents in panic_rwlock.
 */
static rwlock_impl_t panic_rwlock;
static rwlock_impl_t *panic_rwlock_addr;

static void
rw_panic(char *msg, rwlock_impl_t *lp)
{
	if (panicstr)
		return;

	if (casptr(&panic_rwlock_addr, NULL, lp) == NULL)
		panic_rwlock = *lp;

	panic("%s, lp=%p wwwh=%lx thread=%p",
	    msg, (void *)lp, panic_rwlock.rw_wwwh, (void *)curthread);
}

/* ARGSUSED */
void
rw_init(krwlock_t *rwlp, char *name, krw_type_t type, void *arg)
{
	((rwlock_impl_t *)rwlp)->rw_wwwh = 0;
}

void
rw_destroy(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	if (lp->rw_wwwh != 0) {
		if ((lp->rw_wwwh & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK)
			rw_panic("rw_destroy: lock already destroyed", lp);
		else
			rw_panic("rw_destroy: lock still active", lp);
	}

	lp->rw_wwwh = RW_DOUBLE_LOCK;
}

/*
 * Verify that an rwlock is held correctly.
 */
static int
rw_locked(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old = lp->rw_wwwh;

	if (rw == RW_READER)
		return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));

	if (rw == RW_WRITER)
		return ((old & RW_OWNER) == (uintptr_t)curthread);

	return (0);
}

uint_t (*rw_lock_backoff)(uint_t) = NULL;
void (*rw_lock_delay)(uint_t) = NULL;
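
/*
 * rw_lock_backoff and rw_lock_delay are optional hooks that platform code
 * may install to spread out casip() retries under heavy contention; when
 * they are left NULL the retry loops below simply retry immediately.  A
 * purely hypothetical installation (the names and tuning here are
 * illustrative, not the actual platform code) might look like:
 *
 *	static uint_t
 *	my_rw_backoff(uint_t backoff)
 *	{
 *		return (backoff == 0 ? 8 : MIN(backoff << 1, 1024));
 *	}
 *
 *	static void
 *	my_rw_delay(uint_t backoff)
 *	{
 *		uint_t i;
 *
 *		for (i = 0; i < backoff; i++)
 *			SMT_PAUSE();
 *	}
 *
 *	rw_lock_backoff = my_rw_backoff;
 *	rw_lock_delay = my_rw_delay;
 *
 * Note that the consumers below only test rw_lock_delay for NULL, so the
 * two hooks must be installed (or removed) together.
 */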

/*
 * Full-service implementation of rw_enter() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * The only semantic difference between calling rw_enter() and calling
 * rw_enter_sleep() directly is that we assume the caller has already done
 * a THREAD_KPRI_REQUEST() in the RW_READER case.
 */
void
rw_enter_sleep(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old, new, lock_value, lock_busy, lock_wait;
	hrtime_t sleep_time;
	turnstile_t *ts;
	uint_t backoff = 0;
	int loop_count = 0;

	if (rw == RW_READER) {
		lock_value = RW_READ_LOCK;
		lock_busy = RW_WRITE_CLAIMED;
		lock_wait = RW_HAS_WAITERS;
	} else {
		lock_value = RW_WRITE_LOCK(curthread);
		lock_busy = (uintptr_t)RW_LOCKED;
		lock_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
	}

	for (;;) {
		if (((old = lp->rw_wwwh) & lock_busy) == 0) {
			if (casip(&lp->rw_wwwh, old, old + lock_value) != old) {
				if (rw_lock_delay != NULL) {
					backoff = rw_lock_backoff(backoff);
					rw_lock_delay(backoff);
					if (++loop_count == ncpus_online) {
						backoff = 0;
						loop_count = 0;
					}
				}
				continue;
			}
			break;
		}

		if (panicstr)
			return;

		if ((old & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) {
			rw_panic("rw_enter: bad rwlock", lp);
			return;
		}

		if ((old & RW_OWNER) == (uintptr_t)curthread) {
			rw_panic("recursive rw_enter", lp);
			return;
		}

		ts = turnstile_lookup(lp);

		do {
			if (((old = lp->rw_wwwh) & lock_busy) == 0)
				break;
			new = old | lock_wait;
		} while (old != new && casip(&lp->rw_wwwh, old, new) != old);

		if ((old & lock_busy) == 0) {
			/*
			 * The lock appears free now; try the dance again
			 */
			turnstile_exit(lp);
			continue;
		}

		/*
		 * We really are going to block.  Bump the stats, and drop
		 * kpri if we're a reader.
		 */
		ASSERT(lp->rw_wwwh & lock_wait);
		ASSERT(lp->rw_wwwh & RW_LOCKED);

		sleep_time = -gethrtime();
		if (rw == RW_READER) {
			THREAD_KPRI_RELEASE();
			CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1);
			(void) turnstile_block(ts, TS_READER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		} else {
			CPU_STATS_ADDQ(CPU, sys, rw_wrfails, 1);
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		}
		sleep_time += gethrtime();

		LOCKSTAT_RECORD4(LS_RW_ENTER_BLOCK, lp, sleep_time, rw,
		    (old & RW_WRITE_LOCKED) ? 1 : 0,
		    old >> RW_HOLD_COUNT_SHIFT);

		/*
		 * We wake up holding the lock (and having kpri if we're
		 * a reader) via direct handoff from the previous owner.
		 */
		break;
	}

	ASSERT(rw_locked(lp, rw));

	membar_enter();

	LOCKSTAT_RECORD(LS_RW_ENTER_ACQUIRE, lp, rw);
}

/*
 * Return the number of readers to wake, or zero if we should wake a writer.
 * Called only by exiting/downgrading writers (readers don't wake readers).
 */
static int
rw_readers_to_wake(turnstile_t *ts)
{
	kthread_t *next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;
	kthread_t *next_reader = ts->ts_sleepq[TS_READER_Q].sq_first;
	pri_t wpri = (next_writer != NULL) ? DISP_PRIO(next_writer) : -1;
	int count = 0;

	while (next_reader != NULL) {
		if (DISP_PRIO(next_reader) < wpri)
			break;
		next_reader->t_kpri_req++;
		next_reader = next_reader->t_link;
		count++;
	}
	return (count);
}
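
/*
 * Worked example of the writer priority policy (illustrative only, the
 * priorities are arbitrary): suppose an exiting writer finds a blocked
 * writer at priority 60 and blocked readers at priorities 62, 60 and 55.
 * The turnstile sleep queues are kept in descending priority order, so
 * rw_readers_to_wake() walks the reader queue, counts the readers at
 * priorities 62 and 60 (at or above the best writer's 60), and stops at
 * the priority-55 reader.  rw_exit_wakeup() then hands the lock to those
 * two readers at once (hold count 2), leaving RW_WRITE_WANTED set so the
 * priority-55 reader keeps waiting behind the writer.  Had there been no
 * readers at priority 60 or above, the lock would instead have been handed
 * directly to the blocked writer.
 */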

/*
 * Full-service implementation of rw_exit() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * There is no semantic difference between calling rw_exit() and calling
 * rw_exit_wakeup() directly.
 */
void
rw_exit_wakeup(rwlock_impl_t *lp)
{
	turnstile_t *ts;
	uintptr_t old, new, lock_value;
	kthread_t *next_writer;
	int nreaders;
	uint_t backoff = 0;
	int loop_count = 0;

	membar_exit();

	old = lp->rw_wwwh;
	if (old & RW_WRITE_LOCKED) {
		if ((old & RW_OWNER) != (uintptr_t)curthread) {
			rw_panic("rw_exit: not owner", lp);
			lp->rw_wwwh = 0;
			return;
		}
		lock_value = RW_WRITE_LOCK(curthread);
	} else {
		if ((old & RW_LOCKED) == 0) {
			rw_panic("rw_exit: lock not held", lp);
			return;
		}
		lock_value = RW_READ_LOCK;
	}

	for (;;) {
		/*
		 * If this is *not* the final exit of a lock with waiters,
		 * just drop the lock -- there's nothing tricky going on.
		 */
		old = lp->rw_wwwh;
		new = old - lock_value;
		if ((new & (RW_LOCKED | RW_HAS_WAITERS)) != RW_HAS_WAITERS) {
			if (casip(&lp->rw_wwwh, old, new) != old) {
				if (rw_lock_delay != NULL) {
					backoff = rw_lock_backoff(backoff);
					rw_lock_delay(backoff);
					if (++loop_count == ncpus_online) {
						backoff = 0;
						loop_count = 0;
					}
				}
				continue;
			}
			break;
		}

		/*
		 * Perform the final exit of a lock that has waiters.
		 */
		ts = turnstile_lookup(lp);

		next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;

		if ((old & RW_WRITE_LOCKED) &&
		    (nreaders = rw_readers_to_wake(ts)) > 0) {
			/*
			 * Don't drop the lock -- just set the hold count
			 * such that we grant the lock to all readers at once.
			 */
			new = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters > nreaders)
				new |= RW_HAS_WAITERS;
			if (next_writer)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
		} else {
			/*
			 * Don't drop the lock -- just transfer ownership
			 * directly to next_writer.  Note that there must
			 * be at least one waiting writer, because we get
			 * here only if (A) the lock is read-locked or
			 * (B) there are no waiting readers.  In case (A),
			 * since the lock is read-locked there would be no
			 * reason for other readers to have blocked unless
			 * the RW_WRITE_WANTED bit was set.  In case (B),
			 * since there are waiters but no waiting readers,
			 * they must all be waiting writers.
			 */
			ASSERT(lp->rw_wwwh & RW_WRITE_WANTED);
			new = RW_WRITE_LOCK(next_writer);
			if (ts->ts_waiters > 1)
				new |= RW_HAS_WAITERS;
			if (next_writer->t_link)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_WRITER_Q, 1, next_writer);
		}
		break;
	}

	if (lock_value == RW_READ_LOCK) {
		THREAD_KPRI_RELEASE();
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_READER);
	} else {
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_WRITER);
	}
}
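
/*
 * For reference, the caller-facing usage pattern these primitives support
 * (see rwlock(9F)).  This is an illustrative sketch with a hypothetical
 * foo_lock, not code from this file:
 *
 *	krwlock_t foo_lock;
 *
 *	rw_init(&foo_lock, NULL, RW_DEFAULT, NULL);
 *
 *	rw_enter(&foo_lock, RW_READER);		read-only access to foo
 *	rw_exit(&foo_lock);
 *
 *	rw_enter(&foo_lock, RW_WRITER);		exclusive access to foo
 *	rw_exit(&foo_lock);
 *
 *	rw_destroy(&foo_lock);
 *
 * rw_enter() and rw_exit() normally take assembly fast paths and only call
 * into rw_enter_sleep() and rw_exit_wakeup() above when the simple cases
 * fail, as noted in the block comments for those functions.
 */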

int
rw_tryenter(krwlock_t *rwlp, krw_t rw)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old;

	if (rw == RW_READER) {
		uint_t backoff = 0;
		int loop_count = 0;
		THREAD_KPRI_REQUEST();
		for (;;) {
			if ((old = lp->rw_wwwh) & RW_WRITE_CLAIMED) {
				THREAD_KPRI_RELEASE();
				return (0);
			}
			if (casip(&lp->rw_wwwh, old, old + RW_READ_LOCK) == old)
				break;
			if (rw_lock_delay != NULL) {
				backoff = rw_lock_backoff(backoff);
				rw_lock_delay(backoff);
				if (++loop_count == ncpus_online) {
					backoff = 0;
					loop_count = 0;
				}
			}
		}
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	} else {
		if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) != 0)
			return (0);
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	}
	ASSERT(rw_locked(lp, rw));
	membar_enter();
	return (1);
}

void
rw_downgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	THREAD_KPRI_REQUEST();
	membar_exit();

	if ((lp->rw_wwwh & RW_OWNER) != (uintptr_t)curthread) {
		rw_panic("rw_downgrade: not owner", lp);
		return;
	}

	if (atomic_add_ip_nv(&lp->rw_wwwh,
	    RW_READ_LOCK - RW_WRITE_LOCK(curthread)) & RW_HAS_WAITERS) {
		turnstile_t *ts = turnstile_lookup(lp);
		int nreaders = rw_readers_to_wake(ts);
		if (nreaders > 0) {
			uintptr_t delta = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters == nreaders)
				delta -= RW_HAS_WAITERS;
			atomic_add_ip(&lp->rw_wwwh, delta);
		}
		turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
	}
	ASSERT(rw_locked(lp, RW_READER));
	LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, lp);
}

int
rw_tryupgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old, new;

	ASSERT(rw_locked(lp, RW_READER));

	do {
		if (((old = lp->rw_wwwh) & ~RW_HAS_WAITERS) != RW_READ_LOCK)
			return (0);
		new = old + RW_WRITE_LOCK(curthread) - RW_READ_LOCK;
	} while (casip(&lp->rw_wwwh, old, new) != old);

	membar_enter();
	THREAD_KPRI_RELEASE();
	LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, lp);
	ASSERT(rw_locked(lp, RW_WRITER));
	return (1);
}
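
/*
 * rw_tryupgrade() succeeds only when the caller is the sole reader and no
 * writer is waiting, so callers must be prepared for failure.  A typical
 * caller-side pattern (illustrative sketch only, with a hypothetical
 * foo_lock) is to drop the lock, reacquire it as a writer, and re-validate:
 *
 *	rw_enter(&foo_lock, RW_READER);
 *	...
 *	if (!rw_tryupgrade(&foo_lock)) {
 *		rw_exit(&foo_lock);
 *		rw_enter(&foo_lock, RW_WRITER);
 *		... re-check state; it may have changed while unlocked ...
 *	}
 */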

int
rw_read_held(krwlock_t *rwlp)
{
	uintptr_t tmp;

	return (_RW_READ_HELD(rwlp, tmp));
}

int
rw_write_held(krwlock_t *rwlp)
{
	return (_RW_WRITE_HELD(rwlp));
}

int
rw_lock_held(krwlock_t *rwlp)
{
	return (_RW_LOCK_HELD(rwlp));
}

/*
 * Like rw_read_held(), but ASSERTs that the lock is currently held
 */
int
rw_read_locked(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	ASSERT(old & RW_LOCKED);
	return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));
}

/*
 * Returns non-zero if the lock is either held or desired by a writer
 */
int
rw_iswriter(krwlock_t *rwlp)
{
	return (_RW_ISWRITER(rwlp));
}

kthread_t *
rw_owner(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	return ((old & RW_WRITE_LOCKED) ? (kthread_t *)(old & RW_OWNER) : NULL);
}