1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/param.h> 30 #include <sys/thread.h> 31 #include <sys/cmn_err.h> 32 #include <sys/debug.h> 33 #include <sys/cpuvar.h> 34 #include <sys/sobject.h> 35 #include <sys/turnstile.h> 36 #include <sys/rwlock.h> 37 #include <sys/rwlock_impl.h> 38 #include <sys/atomic.h> 39 #include <sys/lockstat.h> 40 41 /* 42 * Big Theory Statement for readers/writer locking primitives. 43 * 44 * An rwlock provides exclusive access to a single thread ("writer") or 45 * concurrent access to multiple threads ("readers"). See rwlock(9F) 46 * for a full description of the interfaces and programming model. 47 * The rest of this comment describes the implementation. 48 * 49 * An rwlock is a single word with the following structure: 50 * 51 * --------------------------------------------------------------------- 52 * | OWNER (writer) or HOLD COUNT (readers) | WRLOCK | WRWANT | WAIT | 53 * --------------------------------------------------------------------- 54 * 63 / 31 .. 3 2 1 0 55 * 56 * The waiters bit (0) indicates whether any threads are blocked waiting 57 * for the lock. The write-wanted bit (1) indicates whether any threads 58 * are blocked waiting for write access. The write-locked bit (2) indicates 59 * whether the lock is held by a writer, which determines whether the upper 60 * bits (3..31 in ILP32, 3..63 in LP64) should be interpreted as the owner 61 * (thread pointer) or the hold count (number of readers). 62 * 63 * In the absence of any contention, a writer gets the lock by setting 64 * this word to (curthread | RW_WRITE_LOCKED); a reader gets the lock 65 * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK). 66 * 67 * A writer will fail to acquire the lock if any other thread owns it. 68 * A reader will fail if the lock is either owned or wanted by a writer. 69 * rw_tryenter() returns 0 in these cases; rw_enter() blocks until the 70 * lock becomes available. 71 * 72 * When a thread blocks it acquires the rwlock's hashed turnstile lock and 73 * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case) 74 * atomically *only if the lock still appears busy*. A thread must never 75 * accidentally block for an available lock since there would be no owner 76 * to awaken it. casip() provides the required atomicity. Once casip() 77 * succeeds, the decision to block becomes final and irreversible. The 78 * thread will not become runnable again until it has been granted ownership 79 * of the lock via direct handoff from a former owner as described below. 80 * 81 * In the absence of any waiters, rw_exit() just clears the lock (if it 82 * is write-locked) or decrements the hold count (if it is read-locked). 83 * Note that even if waiters are present, decrementing the hold count 84 * to a non-zero value requires no special action since the lock is still 85 * held by at least one other thread. 86 * 87 * On the "final exit" (transition to unheld state) of a lock with waiters, 88 * rw_exit_wakeup() grabs the turnstile lock and transfers ownership directly 89 * to the next writer or set of readers. There are several advantages to this 90 * approach: (1) it closes all windows for priority inversion (when a new 91 * writer has grabbed the lock but has not yet inherited from blocked readers); 92 * (2) it prevents starvation of equal-priority threads by granting the lock 93 * in FIFO order; (3) it eliminates the need for a write-wanted count -- a 94 * single bit suffices because the lock remains held until all waiting 95 * writers are gone; (4) when we awaken N readers we can perform a single 96 * "atomic_add(&x, N)" to set the total hold count rather than having all N 97 * threads fight for the cache to perform an "atomic_add(&x, 1)" upon wakeup. 98 * 99 * The most interesting policy decision in rw_exit_wakeup() is which thread 100 * to wake. Starvation is always possible with priority-based scheduling, 101 * but any sane wakeup policy should at least satisfy these requirements: 102 * 103 * (1) The highest-priority thread in the system should not starve. 104 * (2) The highest-priority writer should not starve. 105 * (3) No writer should starve due to lower-priority threads. 106 * (4) No reader should starve due to lower-priority writers. 107 * (5) If all threads have equal priority, none of them should starve. 108 * 109 * We used to employ a writers-always-win policy, which doesn't even 110 * satisfy (1): a steady stream of low-priority writers can starve out 111 * a real-time reader! This is clearly a broken policy -- it violates 112 * (1), (4), and (5) -- but it's how rwlocks always used to behave. 113 * 114 * A round-robin policy (exiting readers grant the lock to blocked writers 115 * and vice versa) satisfies all but (3): a single high-priority writer 116 * and many low-priority readers can starve out medium-priority writers. 117 * 118 * A strict priority policy (grant the lock to the highest priority blocked 119 * thread) satisfies everything but (2): a steady stream of high-priority 120 * readers can permanently starve the highest-priority writer. 121 * 122 * The reason we care about (2) is that it's important to process writers 123 * reasonably quickly -- even if they're low priority -- because their very 124 * presence causes all readers to take the slow (blocking) path through this 125 * code. There is also a general sense that writers deserve some degree of 126 * deference because they're updating the data upon which all readers act. 127 * Presumably this data should not be allowed to become arbitrarily stale 128 * due to writer starvation. Finally, it seems reasonable to level the 129 * playing field a bit to compensate for the fact that it's so much harder 130 * for a writer to get in when there are already many readers present. 131 * 132 * A hybrid of round-robin and strict priority can be made to satisfy 133 * all five criteria. In this "writer priority policy" exiting readers 134 * always grant the lock to waiting writers, but exiting writers only 135 * grant the lock to readers of the same or higher priority than the 136 * highest-priority blocked writer. Thus requirement (2) is satisfied, 137 * necessarily, by a willful act of priority inversion: an exiting reader 138 * will grant the lock to a blocked writer even if there are blocked 139 * readers of higher priority. The situation is mitigated by the fact 140 * that writers always inherit priority from blocked readers, and the 141 * writer will awaken those readers as soon as it exits the lock. 142 * 143 * rw_downgrade() follows the same wakeup policy as an exiting writer. 144 * 145 * rw_tryupgrade() has the same failure mode as rw_tryenter() for a 146 * write lock. Both honor the WRITE_WANTED bit by specification. 147 * 148 * The following rules apply to manipulation of rwlock internal state: 149 * 150 * (1) The rwlock is only modified via the atomic primitives casip() 151 * and atomic_add_ip(). 152 * 153 * (2) The waiters bit and write-wanted bit are only modified under 154 * turnstile_lookup(). This ensures that the turnstile is consistent 155 * with the rwlock. 156 * 157 * (3) Waiters receive the lock by direct handoff from the previous 158 * owner. Therefore, waiters *always* wake up holding the lock. 159 */ 160 161 /* 162 * The sobj_ops vector exports a set of functions needed when a thread 163 * is asleep on a synchronization object of a given type. 164 */ 165 static sobj_ops_t rw_sobj_ops = { 166 SOBJ_RWLOCK, rw_owner, turnstile_stay_asleep, turnstile_change_pri 167 }; 168 169 /* 170 * If the system panics on an rwlock, save the address of the offending 171 * rwlock in panic_rwlock_addr, and save the contents in panic_rwlock. 172 */ 173 static rwlock_impl_t panic_rwlock; 174 static rwlock_impl_t *panic_rwlock_addr; 175 176 static void 177 rw_panic(char *msg, rwlock_impl_t *lp) 178 { 179 if (panicstr) 180 return; 181 182 if (casptr(&panic_rwlock_addr, NULL, lp) == NULL) 183 panic_rwlock = *lp; 184 185 panic("%s, lp=%p wwwh=%lx thread=%p", 186 msg, lp, panic_rwlock.rw_wwwh, curthread); 187 } 188 189 /* ARGSUSED */ 190 void 191 rw_init(krwlock_t *rwlp, char *name, krw_type_t type, void *arg) 192 { 193 ((rwlock_impl_t *)rwlp)->rw_wwwh = 0; 194 } 195 196 void 197 rw_destroy(krwlock_t *rwlp) 198 { 199 rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 200 201 if (lp->rw_wwwh != 0) { 202 if ((lp->rw_wwwh & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) 203 rw_panic("rw_destroy: lock already destroyed", lp); 204 else 205 rw_panic("rw_destroy: lock still active", lp); 206 } 207 208 lp->rw_wwwh = RW_DOUBLE_LOCK; 209 } 210 211 /* 212 * Verify that an rwlock is held correctly. 213 */ 214 static int 215 rw_locked(rwlock_impl_t *lp, krw_t rw) 216 { 217 uintptr_t old = lp->rw_wwwh; 218 219 if (rw == RW_READER) 220 return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED)); 221 222 if (rw == RW_WRITER) 223 return ((old & RW_OWNER) == (uintptr_t)curthread); 224 225 return (0); 226 } 227 228 /* 229 * Full-service implementation of rw_enter() to handle all the hard cases. 230 * Called from the assembly version if anything complicated is going on. 231 * The only semantic difference between calling rw_enter() and calling 232 * rw_enter_sleep() directly is that we assume the caller has already done 233 * a THREAD_KPRI_REQUEST() in the RW_READER case. 234 */ 235 void 236 rw_enter_sleep(rwlock_impl_t *lp, krw_t rw) 237 { 238 uintptr_t old, new, lock_value, lock_busy, lock_wait; 239 hrtime_t sleep_time; 240 turnstile_t *ts; 241 242 if (rw == RW_READER) { 243 lock_value = RW_READ_LOCK; 244 lock_busy = RW_WRITE_CLAIMED; 245 lock_wait = RW_HAS_WAITERS; 246 } else { 247 lock_value = RW_WRITE_LOCK(curthread); 248 lock_busy = (uintptr_t)RW_LOCKED; 249 lock_wait = RW_HAS_WAITERS | RW_WRITE_WANTED; 250 } 251 252 for (;;) { 253 if (((old = lp->rw_wwwh) & lock_busy) == 0) { 254 if (casip(&lp->rw_wwwh, old, old + lock_value) != old) 255 continue; 256 break; 257 } 258 259 if (panicstr) 260 return; 261 262 if ((old & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) { 263 rw_panic("rw_enter: bad rwlock", lp); 264 return; 265 } 266 267 if ((old & RW_OWNER) == (uintptr_t)curthread) { 268 rw_panic("recursive rw_enter", lp); 269 return; 270 } 271 272 ts = turnstile_lookup(lp); 273 274 do { 275 if (((old = lp->rw_wwwh) & lock_busy) == 0) 276 break; 277 new = old | lock_wait; 278 } while (old != new && casip(&lp->rw_wwwh, old, new) != old); 279 280 if ((old & lock_busy) == 0) { 281 /* 282 * The lock appears free now; try the dance again 283 */ 284 turnstile_exit(lp); 285 continue; 286 } 287 288 /* 289 * We really are going to block. Bump the stats, and drop 290 * kpri if we're a reader. 291 */ 292 ASSERT(lp->rw_wwwh & lock_wait); 293 ASSERT(lp->rw_wwwh & RW_LOCKED); 294 295 sleep_time = -gethrtime(); 296 if (rw == RW_READER) { 297 THREAD_KPRI_RELEASE(); 298 CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1); 299 (void) turnstile_block(ts, TS_READER_Q, lp, 300 &rw_sobj_ops, NULL, NULL); 301 } else { 302 CPU_STATS_ADDQ(CPU, sys, rw_wrfails, 1); 303 (void) turnstile_block(ts, TS_WRITER_Q, lp, 304 &rw_sobj_ops, NULL, NULL); 305 } 306 sleep_time += gethrtime(); 307 308 LOCKSTAT_RECORD4(LS_RW_ENTER_BLOCK, lp, sleep_time, rw, 309 (old & RW_WRITE_LOCKED) ? 1 : 0, 310 old >> RW_HOLD_COUNT_SHIFT); 311 312 /* 313 * We wake up holding the lock (and having kpri if we're 314 * a reader) via direct handoff from the previous owner. 315 */ 316 break; 317 } 318 319 ASSERT(rw_locked(lp, rw)); 320 321 membar_enter(); 322 323 LOCKSTAT_RECORD(LS_RW_ENTER_ACQUIRE, lp, rw); 324 } 325 326 /* 327 * Return the number of readers to wake, or zero if we should wake a writer. 328 * Called only by exiting/downgrading writers (readers don't wake readers). 329 */ 330 static int 331 rw_readers_to_wake(turnstile_t *ts) 332 { 333 kthread_t *next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first; 334 kthread_t *next_reader = ts->ts_sleepq[TS_READER_Q].sq_first; 335 pri_t wpri = (next_writer != NULL) ? DISP_PRIO(next_writer) : -1; 336 int count = 0; 337 338 while (next_reader != NULL) { 339 if (DISP_PRIO(next_reader) < wpri) 340 break; 341 next_reader->t_kpri_req++; 342 next_reader = next_reader->t_link; 343 count++; 344 } 345 return (count); 346 } 347 348 /* 349 * Full-service implementation of rw_exit() to handle all the hard cases. 350 * Called from the assembly version if anything complicated is going on. 351 * There is no semantic difference between calling rw_exit() and calling 352 * rw_exit_wakeup() directly. 353 */ 354 void 355 rw_exit_wakeup(rwlock_impl_t *lp) 356 { 357 turnstile_t *ts; 358 uintptr_t old, new, lock_value; 359 kthread_t *next_writer; 360 int nreaders; 361 362 membar_exit(); 363 364 old = lp->rw_wwwh; 365 if (old & RW_WRITE_LOCKED) { 366 if ((old & RW_OWNER) != (uintptr_t)curthread) { 367 rw_panic("rw_exit: not owner", lp); 368 lp->rw_wwwh = 0; 369 return; 370 } 371 lock_value = RW_WRITE_LOCK(curthread); 372 } else { 373 if ((old & RW_LOCKED) == 0) { 374 rw_panic("rw_exit: lock not held", lp); 375 return; 376 } 377 lock_value = RW_READ_LOCK; 378 } 379 380 for (;;) { 381 /* 382 * If this is *not* the final exit of a lock with waiters, 383 * just drop the lock -- there's nothing tricky going on. 384 */ 385 old = lp->rw_wwwh; 386 new = old - lock_value; 387 if ((new & (RW_LOCKED | RW_HAS_WAITERS)) != RW_HAS_WAITERS) { 388 if (casip(&lp->rw_wwwh, old, new) != old) 389 continue; 390 break; 391 } 392 393 /* 394 * Perform the final exit of a lock that has waiters. 395 */ 396 ts = turnstile_lookup(lp); 397 398 next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first; 399 400 if ((old & RW_WRITE_LOCKED) && 401 (nreaders = rw_readers_to_wake(ts)) > 0) { 402 /* 403 * Don't drop the lock -- just set the hold count 404 * such that we grant the lock to all readers at once. 405 */ 406 new = nreaders * RW_READ_LOCK; 407 if (ts->ts_waiters > nreaders) 408 new |= RW_HAS_WAITERS; 409 if (next_writer) 410 new |= RW_WRITE_WANTED; 411 lp->rw_wwwh = new; 412 membar_enter(); 413 turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL); 414 } else { 415 /* 416 * Don't drop the lock -- just transfer ownership 417 * directly to next_writer. Note that there must 418 * be at least one waiting writer, because we get 419 * here only if (A) the lock is read-locked or 420 * (B) there are no waiting readers. In case (A), 421 * since the lock is read-locked there would be no 422 * reason for other readers to have blocked unless 423 * the RW_WRITE_WANTED bit was set. In case (B), 424 * since there are waiters but no waiting readers, 425 * they must all be waiting writers. 426 */ 427 ASSERT(lp->rw_wwwh & RW_WRITE_WANTED); 428 new = RW_WRITE_LOCK(next_writer); 429 if (ts->ts_waiters > 1) 430 new |= RW_HAS_WAITERS; 431 if (next_writer->t_link) 432 new |= RW_WRITE_WANTED; 433 lp->rw_wwwh = new; 434 membar_enter(); 435 turnstile_wakeup(ts, TS_WRITER_Q, 1, next_writer); 436 } 437 break; 438 } 439 440 if (lock_value == RW_READ_LOCK) { 441 THREAD_KPRI_RELEASE(); 442 LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_READER); 443 } else { 444 LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_WRITER); 445 } 446 } 447 448 int 449 rw_tryenter(krwlock_t *rwlp, krw_t rw) 450 { 451 rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 452 uintptr_t old; 453 454 if (rw == RW_READER) { 455 THREAD_KPRI_REQUEST(); 456 do { 457 if ((old = lp->rw_wwwh) & RW_WRITE_CLAIMED) { 458 THREAD_KPRI_RELEASE(); 459 return (0); 460 } 461 } while (casip(&lp->rw_wwwh, old, old + RW_READ_LOCK) != old); 462 LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw); 463 } else { 464 if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) != 0) 465 return (0); 466 LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw); 467 } 468 ASSERT(rw_locked(lp, rw)); 469 membar_enter(); 470 return (1); 471 } 472 473 void 474 rw_downgrade(krwlock_t *rwlp) 475 { 476 rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 477 478 THREAD_KPRI_REQUEST(); 479 membar_exit(); 480 481 if ((lp->rw_wwwh & RW_OWNER) != (uintptr_t)curthread) { 482 rw_panic("rw_downgrade: not owner", lp); 483 return; 484 } 485 486 if (atomic_add_ip_nv(&lp->rw_wwwh, 487 RW_READ_LOCK - RW_WRITE_LOCK(curthread)) & RW_HAS_WAITERS) { 488 turnstile_t *ts = turnstile_lookup(lp); 489 int nreaders = rw_readers_to_wake(ts); 490 if (nreaders > 0) { 491 uintptr_t delta = nreaders * RW_READ_LOCK; 492 if (ts->ts_waiters == nreaders) 493 delta -= RW_HAS_WAITERS; 494 atomic_add_ip(&lp->rw_wwwh, delta); 495 } 496 turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL); 497 } 498 ASSERT(rw_locked(lp, RW_READER)); 499 LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, lp); 500 } 501 502 int 503 rw_tryupgrade(krwlock_t *rwlp) 504 { 505 rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 506 uintptr_t old, new; 507 508 ASSERT(rw_locked(lp, RW_READER)); 509 510 do { 511 if (((old = lp->rw_wwwh) & ~RW_HAS_WAITERS) != RW_READ_LOCK) 512 return (0); 513 new = old + RW_WRITE_LOCK(curthread) - RW_READ_LOCK; 514 } while (casip(&lp->rw_wwwh, old, new) != old); 515 516 membar_enter(); 517 THREAD_KPRI_RELEASE(); 518 LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, lp); 519 ASSERT(rw_locked(lp, RW_WRITER)); 520 return (1); 521 } 522 523 int 524 rw_read_held(krwlock_t *rwlp) 525 { 526 uintptr_t tmp; 527 528 return (_RW_READ_HELD(rwlp, tmp)); 529 } 530 531 int 532 rw_write_held(krwlock_t *rwlp) 533 { 534 return (_RW_WRITE_HELD(rwlp)); 535 } 536 537 int 538 rw_lock_held(krwlock_t *rwlp) 539 { 540 return (_RW_LOCK_HELD(rwlp)); 541 } 542 543 /* 544 * Like rw_read_held(), but ASSERTs that the lock is currently held 545 */ 546 int 547 rw_read_locked(krwlock_t *rwlp) 548 { 549 uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh; 550 551 ASSERT(old & RW_LOCKED); 552 return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED)); 553 } 554 555 /* 556 * Returns non-zero if the lock is either held or desired by a writer 557 */ 558 int 559 rw_iswriter(krwlock_t *rwlp) 560 { 561 return (_RW_ISWRITER(rwlp)); 562 } 563 564 kthread_t * 565 rw_owner(krwlock_t *rwlp) 566 { 567 uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh; 568 569 return ((old & RW_WRITE_LOCKED) ? (kthread_t *)(old & RW_OWNER) : NULL); 570 } 571