// SPDX-License-Identifier: GPL-2.0
/* kernel/rwsem.c: R/W semaphores, public implementation
 *
 * Written by David Howells (dhowells@redhat.com).
 * Derived from asm-i386/semaphore.h
 *
 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
 * and Michel Lespinasse <walken@google.com>
 *
 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
 *
 * Rwsem count bit fields re-definition and rwsem rearchitecture by
 * Waiman Long <longman@redhat.com> and
 * Peter Zijlstra <peterz@infradead.org>.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>

#include "rwsem.h"
#include "lock_events.h"

/*
 * The least significant 3 bits of the owner value have the following
 * meanings when set.
 *  - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
 *  - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock.
 *  - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock.
 *
 * When the rwsem is either owned by an anonymous writer, or it is
 * reader-owned, but a spinning writer has timed out, both nonspinnable
 * bits will be set to disable optimistic spinning by readers and writers.
 * In the latter case, the last unlocking reader should then check the
 * writer nonspinnable bit and clear it only to give writers preference
 * to acquire the lock via optimistic spinning, but not readers. Similar
 * action is also done in the reader slowpath.
 *
 * When a writer acquires a rwsem, it puts its task_struct pointer
 * into the owner field. It is cleared after an unlock.
 *
 * When a reader acquires a rwsem, it will also put its task_struct
 * pointer into the owner field with the RWSEM_READER_OWNED bit set.
 * On unlock, the owner field will largely be left untouched. So
 * for a free or reader-owned rwsem, the owner value may contain
 * information about the last reader that acquires the rwsem.
 *
 * That information may be helpful in debugging cases where the system
 * seems to hang on a reader owned rwsem especially if only one reader
 * is involved. Ideally we would like to track all the readers that own
 * a rwsem, but the overhead is simply too big.
 *
 * Reader optimistic spinning is helpful when the reader critical section
 * is short and there aren't that many readers around. It makes readers
 * relatively more preferred than writers. When a writer times out spinning
 * on a reader-owned lock and sets the nonspinnable bits, there are two main
 * reasons for that.
 *
 *  1) The reader critical section is long, perhaps the task sleeps after
 *     acquiring the read lock.
 *  2) There are just too many readers contending the lock causing it to
 *     take a while to service all of them.
 *
 * In the former case, long reader critical section will impede the progress
 * of writers which is usually more important for system performance. In
 * the latter case, reader optimistic spinning tends to make the reader
 * groups that contain readers that acquire the lock together smaller
 * leading to more of them. That may hurt performance in some cases. In
 * other words, the setting of nonspinnable bits indicates that reader
 * optimistic spinning may not be helpful for those workloads that cause
 * it.
 *
 * Therefore, any writers that had observed the setting of the writer
 * nonspinnable bit for a given rwsem after they fail to acquire the lock
 * via optimistic spinning will set the reader nonspinnable bit once they
 * acquire the write lock.
Similarly, readers that observe the setting 87 * of reader nonspinnable bit at slowpath entry will set the reader 88 * nonspinnable bits when they acquire the read lock via the wakeup path. 89 * 90 * Once the reader nonspinnable bit is on, it will only be reset when 91 * a writer is able to acquire the rwsem in the fast path or somehow a 92 * reader or writer in the slowpath doesn't observe the nonspinable bit. 93 * 94 * This is to discourage reader optmistic spinning on that particular 95 * rwsem and make writers more preferred. This adaptive disabling of reader 96 * optimistic spinning will alleviate the negative side effect of this 97 * feature. 98 */ 99 #define RWSEM_READER_OWNED (1UL << 0) 100 #define RWSEM_RD_NONSPINNABLE (1UL << 1) 101 #define RWSEM_WR_NONSPINNABLE (1UL << 2) 102 #define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE) 103 #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) 104 105 #ifdef CONFIG_DEBUG_RWSEMS 106 # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ 107 if (!debug_locks_silent && \ 108 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ 109 #c, atomic_long_read(&(sem)->count), \ 110 (unsigned long) sem->magic, \ 111 atomic_long_read(&(sem)->owner), (long)current, \ 112 list_empty(&(sem)->wait_list) ? 
"" : "not ")) \ 113 debug_locks_off(); \ 114 } while (0) 115 #else 116 # define DEBUG_RWSEMS_WARN_ON(c, sem) 117 #endif 118 119 /* 120 * On 64-bit architectures, the bit definitions of the count are: 121 * 122 * Bit 0 - writer locked bit 123 * Bit 1 - waiters present bit 124 * Bit 2 - lock handoff bit 125 * Bits 3-7 - reserved 126 * Bits 8-62 - 55-bit reader count 127 * Bit 63 - read fail bit 128 * 129 * On 32-bit architectures, the bit definitions of the count are: 130 * 131 * Bit 0 - writer locked bit 132 * Bit 1 - waiters present bit 133 * Bit 2 - lock handoff bit 134 * Bits 3-7 - reserved 135 * Bits 8-30 - 23-bit reader count 136 * Bit 31 - read fail bit 137 * 138 * It is not likely that the most significant bit (read fail bit) will ever 139 * be set. This guard bit is still checked anyway in the down_read() fastpath 140 * just in case we need to use up more of the reader bits for other purpose 141 * in the future. 142 * 143 * atomic_long_fetch_add() is used to obtain reader lock, whereas 144 * atomic_long_cmpxchg() will be used to obtain writer lock. 145 * 146 * There are three places where the lock handoff bit may be set or cleared. 147 * 1) rwsem_mark_wake() for readers. 148 * 2) rwsem_try_write_lock() for writers. 149 * 3) Error path of rwsem_down_write_slowpath(). 150 * 151 * For all the above cases, wait_lock will be held. A writer must also 152 * be the first one in the wait_list to be eligible for setting the handoff 153 * bit. So concurrent setting/clearing of handoff bit is not possible. 
154 */ 155 #define RWSEM_WRITER_LOCKED (1UL << 0) 156 #define RWSEM_FLAG_WAITERS (1UL << 1) 157 #define RWSEM_FLAG_HANDOFF (1UL << 2) 158 #define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1)) 159 160 #define RWSEM_READER_SHIFT 8 161 #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) 162 #define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) 163 #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED 164 #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) 165 #define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ 166 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL) 167 168 /* 169 * All writes to owner are protected by WRITE_ONCE() to make sure that 170 * store tearing can't happen as optimistic spinners may read and use 171 * the owner value concurrently without lock. Read from owner, however, 172 * may not need READ_ONCE() as long as the pointer value is only used 173 * for comparison and isn't being dereferenced. 174 */ 175 static inline void rwsem_set_owner(struct rw_semaphore *sem) 176 { 177 atomic_long_set(&sem->owner, (long)current); 178 } 179 180 static inline void rwsem_clear_owner(struct rw_semaphore *sem) 181 { 182 atomic_long_set(&sem->owner, 0); 183 } 184 185 /* 186 * Test the flags in the owner field. 187 */ 188 static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) 189 { 190 return atomic_long_read(&sem->owner) & flags; 191 } 192 193 /* 194 * The task_struct pointer of the last owning reader will be left in 195 * the owner field. 196 * 197 * Note that the owner value just indicates the task has owned the rwsem 198 * previously, it may not be the real owner or one of the real owners 199 * anymore when that field is examined, so take it with a grain of salt. 200 * 201 * The reader non-spinnable bit is preserved. 
202 */ 203 static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, 204 struct task_struct *owner) 205 { 206 unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | 207 (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); 208 209 atomic_long_set(&sem->owner, val); 210 } 211 212 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) 213 { 214 __rwsem_set_reader_owned(sem, current); 215 } 216 217 /* 218 * Return true if the rwsem is owned by a reader. 219 */ 220 static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) 221 { 222 #ifdef CONFIG_DEBUG_RWSEMS 223 /* 224 * Check the count to see if it is write-locked. 225 */ 226 long count = atomic_long_read(&sem->count); 227 228 if (count & RWSEM_WRITER_MASK) 229 return false; 230 #endif 231 return rwsem_test_oflags(sem, RWSEM_READER_OWNED); 232 } 233 234 #ifdef CONFIG_DEBUG_RWSEMS 235 /* 236 * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there 237 * is a task pointer in owner of a reader-owned rwsem, it will be the 238 * real owner or one of the real owners. The only exception is when the 239 * unlock is done by up_read_non_owner(). 240 */ 241 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) 242 { 243 unsigned long val = atomic_long_read(&sem->owner); 244 245 while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { 246 if (atomic_long_try_cmpxchg(&sem->owner, &val, 247 val & RWSEM_OWNER_FLAGS_MASK)) 248 return; 249 } 250 } 251 #else 252 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) 253 { 254 } 255 #endif 256 257 /* 258 * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag 259 * remains set. Otherwise, the operation will be aborted. 
260 */ 261 static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) 262 { 263 unsigned long owner = atomic_long_read(&sem->owner); 264 265 do { 266 if (!(owner & RWSEM_READER_OWNED)) 267 break; 268 if (owner & RWSEM_NONSPINNABLE) 269 break; 270 } while (!atomic_long_try_cmpxchg(&sem->owner, &owner, 271 owner | RWSEM_NONSPINNABLE)); 272 } 273 274 static inline bool rwsem_read_trylock(struct rw_semaphore *sem) 275 { 276 long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); 277 if (WARN_ON_ONCE(cnt < 0)) 278 rwsem_set_nonspinnable(sem); 279 return !(cnt & RWSEM_READ_FAILED_MASK); 280 } 281 282 /* 283 * Return just the real task structure pointer of the owner 284 */ 285 static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) 286 { 287 return (struct task_struct *) 288 (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); 289 } 290 291 /* 292 * Return the real task structure pointer of the owner and the embedded 293 * flags in the owner. pflags must be non-NULL. 294 */ 295 static inline struct task_struct * 296 rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) 297 { 298 unsigned long owner = atomic_long_read(&sem->owner); 299 300 *pflags = owner & RWSEM_OWNER_FLAGS_MASK; 301 return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); 302 } 303 304 /* 305 * Guide to the rw_semaphore's count field. 306 * 307 * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned 308 * by a writer. 309 * 310 * The lock is owned by readers when 311 * (1) the RWSEM_WRITER_LOCKED isn't set in count, 312 * (2) some of the reader bits are set in count, and 313 * (3) the owner field has RWSEM_READ_OWNED bit set. 314 * 315 * Having some reader bits set is not enough to guarantee a readers owned 316 * lock as the readers may be in the process of backing out from the count 317 * and a writer has just released the lock. So another writer may steal 318 * the lock immediately after that. 
319 */ 320 321 /* 322 * Initialize an rwsem: 323 */ 324 void __init_rwsem(struct rw_semaphore *sem, const char *name, 325 struct lock_class_key *key) 326 { 327 #ifdef CONFIG_DEBUG_LOCK_ALLOC 328 /* 329 * Make sure we are not reinitializing a held semaphore: 330 */ 331 debug_check_no_locks_freed((void *)sem, sizeof(*sem)); 332 lockdep_init_map(&sem->dep_map, name, key, 0); 333 #endif 334 #ifdef CONFIG_DEBUG_RWSEMS 335 sem->magic = sem; 336 #endif 337 atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); 338 raw_spin_lock_init(&sem->wait_lock); 339 INIT_LIST_HEAD(&sem->wait_list); 340 atomic_long_set(&sem->owner, 0L); 341 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER 342 osq_lock_init(&sem->osq); 343 #endif 344 } 345 EXPORT_SYMBOL(__init_rwsem); 346 347 enum rwsem_waiter_type { 348 RWSEM_WAITING_FOR_WRITE, 349 RWSEM_WAITING_FOR_READ 350 }; 351 352 struct rwsem_waiter { 353 struct list_head list; 354 struct task_struct *task; 355 enum rwsem_waiter_type type; 356 unsigned long timeout; 357 unsigned long last_rowner; 358 }; 359 #define rwsem_first_waiter(sem) \ 360 list_first_entry(&sem->wait_list, struct rwsem_waiter, list) 361 362 enum rwsem_wake_type { 363 RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ 364 RWSEM_WAKE_READERS, /* Wake readers only */ 365 RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ 366 }; 367 368 enum writer_wait_state { 369 WRITER_NOT_FIRST, /* Writer is not first in wait list */ 370 WRITER_FIRST, /* Writer is first in wait list */ 371 WRITER_HANDOFF /* Writer is first & handoff needed */ 372 }; 373 374 /* 375 * The typical HZ value is either 250 or 1000. So set the minimum waiting 376 * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait 377 * queue before initiating the handoff protocol. 378 */ 379 #define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) 380 381 /* 382 * Magic number to batch-wakeup waiting readers, even when writers are 383 * also present in the queue. 
This both limits the amount of work the 384 * waking thread must do and also prevents any potential counter overflow, 385 * however unlikely. 386 */ 387 #define MAX_READERS_WAKEUP 0x100 388 389 /* 390 * handle the lock release when processes blocked on it that can now run 391 * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must 392 * have been set. 393 * - there must be someone on the queue 394 * - the wait_lock must be held by the caller 395 * - tasks are marked for wakeup, the caller must later invoke wake_up_q() 396 * to actually wakeup the blocked task(s) and drop the reference count, 397 * preferably when the wait_lock is released 398 * - woken process blocks are discarded from the list after having task zeroed 399 * - writers are only marked woken if downgrading is false 400 */ 401 static void rwsem_mark_wake(struct rw_semaphore *sem, 402 enum rwsem_wake_type wake_type, 403 struct wake_q_head *wake_q) 404 { 405 struct rwsem_waiter *waiter, *tmp; 406 long oldcount, woken = 0, adjustment = 0; 407 struct list_head wlist; 408 409 lockdep_assert_held(&sem->wait_lock); 410 411 /* 412 * Take a peek at the queue head waiter such that we can determine 413 * the wakeup(s) to perform. 414 */ 415 waiter = rwsem_first_waiter(sem); 416 417 if (waiter->type == RWSEM_WAITING_FOR_WRITE) { 418 if (wake_type == RWSEM_WAKE_ANY) { 419 /* 420 * Mark writer at the front of the queue for wakeup. 421 * Until the task is actually later awoken later by 422 * the caller, other writers are able to steal it. 423 * Readers, on the other hand, will block as they 424 * will notice the queued writer. 425 */ 426 wake_q_add(wake_q, waiter->task); 427 lockevent_inc(rwsem_wake_writer); 428 } 429 430 return; 431 } 432 433 /* 434 * No reader wakeup if there are too many of them already. 435 */ 436 if (unlikely(atomic_long_read(&sem->count) < 0)) 437 return; 438 439 /* 440 * Writers might steal the lock before we grant it to the next reader. 
441 * We prefer to do the first reader grant before counting readers 442 * so we can bail out early if a writer stole the lock. 443 */ 444 if (wake_type != RWSEM_WAKE_READ_OWNED) { 445 struct task_struct *owner; 446 447 adjustment = RWSEM_READER_BIAS; 448 oldcount = atomic_long_fetch_add(adjustment, &sem->count); 449 if (unlikely(oldcount & RWSEM_WRITER_MASK)) { 450 /* 451 * When we've been waiting "too" long (for writers 452 * to give up the lock), request a HANDOFF to 453 * force the issue. 454 */ 455 if (!(oldcount & RWSEM_FLAG_HANDOFF) && 456 time_after(jiffies, waiter->timeout)) { 457 adjustment -= RWSEM_FLAG_HANDOFF; 458 lockevent_inc(rwsem_rlock_handoff); 459 } 460 461 atomic_long_add(-adjustment, &sem->count); 462 return; 463 } 464 /* 465 * Set it to reader-owned to give spinners an early 466 * indication that readers now have the lock. 467 * The reader nonspinnable bit seen at slowpath entry of 468 * the reader is copied over. 469 */ 470 owner = waiter->task; 471 if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { 472 owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); 473 lockevent_inc(rwsem_opt_norspin); 474 } 475 __rwsem_set_reader_owned(sem, owner); 476 } 477 478 /* 479 * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the 480 * queue. We know that the woken will be at least 1 as we accounted 481 * for above. Note we increment the 'active part' of the count by the 482 * number of readers before waking any processes up. 483 * 484 * This is an adaptation of the phase-fair R/W locks where at the 485 * reader phase (first waiter is a reader), all readers are eligible 486 * to acquire the lock at the same time irrespective of their order 487 * in the queue. The writers acquire the lock according to their 488 * order in the queue. 489 * 490 * We have to do wakeup in 2 passes to prevent the possibility that 491 * the reader count may be decremented before it is incremented. 
It 492 * is because the to-be-woken waiter may not have slept yet. So it 493 * may see waiter->task got cleared, finish its critical section and 494 * do an unlock before the reader count increment. 495 * 496 * 1) Collect the read-waiters in a separate list, count them and 497 * fully increment the reader count in rwsem. 498 * 2) For each waiters in the new list, clear waiter->task and 499 * put them into wake_q to be woken up later. 500 */ 501 INIT_LIST_HEAD(&wlist); 502 list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { 503 if (waiter->type == RWSEM_WAITING_FOR_WRITE) 504 continue; 505 506 woken++; 507 list_move_tail(&waiter->list, &wlist); 508 509 /* 510 * Limit # of readers that can be woken up per wakeup call. 511 */ 512 if (woken >= MAX_READERS_WAKEUP) 513 break; 514 } 515 516 adjustment = woken * RWSEM_READER_BIAS - adjustment; 517 lockevent_cond_inc(rwsem_wake_reader, woken); 518 if (list_empty(&sem->wait_list)) { 519 /* hit end of list above */ 520 adjustment -= RWSEM_FLAG_WAITERS; 521 } 522 523 /* 524 * When we've woken a reader, we no longer need to force writers 525 * to give up the lock and we can clear HANDOFF. 526 */ 527 if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) 528 adjustment -= RWSEM_FLAG_HANDOFF; 529 530 if (adjustment) 531 atomic_long_add(adjustment, &sem->count); 532 533 /* 2nd pass */ 534 list_for_each_entry_safe(waiter, tmp, &wlist, list) { 535 struct task_struct *tsk; 536 537 tsk = waiter->task; 538 get_task_struct(tsk); 539 540 /* 541 * Ensure calling get_task_struct() before setting the reader 542 * waiter to nil such that rwsem_down_read_slowpath() cannot 543 * race with do_exit() by always holding a reference count 544 * to the task to wakeup. 545 */ 546 smp_store_release(&waiter->task, NULL); 547 /* 548 * Ensure issuing the wakeup (either by us or someone else) 549 * after setting the reader waiter to nil. 
550 */ 551 wake_q_add_safe(wake_q, tsk); 552 } 553 } 554 555 /* 556 * This function must be called with the sem->wait_lock held to prevent 557 * race conditions between checking the rwsem wait list and setting the 558 * sem->count accordingly. 559 * 560 * If wstate is WRITER_HANDOFF, it will make sure that either the handoff 561 * bit is set or the lock is acquired with handoff bit cleared. 562 */ 563 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, 564 enum writer_wait_state wstate) 565 { 566 long count, new; 567 568 lockdep_assert_held(&sem->wait_lock); 569 570 count = atomic_long_read(&sem->count); 571 do { 572 bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); 573 574 if (has_handoff && wstate == WRITER_NOT_FIRST) 575 return false; 576 577 new = count; 578 579 if (count & RWSEM_LOCK_MASK) { 580 if (has_handoff || (wstate != WRITER_HANDOFF)) 581 return false; 582 583 new |= RWSEM_FLAG_HANDOFF; 584 } else { 585 new |= RWSEM_WRITER_LOCKED; 586 new &= ~RWSEM_FLAG_HANDOFF; 587 588 if (list_is_singular(&sem->wait_list)) 589 new &= ~RWSEM_FLAG_WAITERS; 590 } 591 } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); 592 593 /* 594 * We have either acquired the lock with handoff bit cleared or 595 * set the handoff bit. 596 */ 597 if (new & RWSEM_FLAG_HANDOFF) 598 return false; 599 600 rwsem_set_owner(sem); 601 return true; 602 } 603 604 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER 605 /* 606 * Try to acquire read lock before the reader is put on wait queue. 607 * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff 608 * is ongoing. 
609 */ 610 static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) 611 { 612 long count = atomic_long_read(&sem->count); 613 614 if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) 615 return false; 616 617 count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); 618 if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { 619 rwsem_set_reader_owned(sem); 620 lockevent_inc(rwsem_opt_rlock); 621 return true; 622 } 623 624 /* Back out the change */ 625 atomic_long_add(-RWSEM_READER_BIAS, &sem->count); 626 return false; 627 } 628 629 /* 630 * Try to acquire write lock before the writer has been put on wait queue. 631 */ 632 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) 633 { 634 long count = atomic_long_read(&sem->count); 635 636 while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) { 637 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, 638 count | RWSEM_WRITER_LOCKED)) { 639 rwsem_set_owner(sem); 640 lockevent_inc(rwsem_opt_wlock); 641 return true; 642 } 643 } 644 return false; 645 } 646 647 static inline bool owner_on_cpu(struct task_struct *owner) 648 { 649 /* 650 * As lock holder preemption issue, we both skip spinning if 651 * task is not on cpu or its cpu is preempted 652 */ 653 return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); 654 } 655 656 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, 657 unsigned long nonspinnable) 658 { 659 struct task_struct *owner; 660 unsigned long flags; 661 bool ret = true; 662 663 BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); 664 665 if (need_resched()) { 666 lockevent_inc(rwsem_opt_fail); 667 return false; 668 } 669 670 preempt_disable(); 671 rcu_read_lock(); 672 owner = rwsem_owner_flags(sem, &flags); 673 /* 674 * Don't check the read-owner as the entry may be stale. 
675 */ 676 if ((flags & nonspinnable) || 677 (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) 678 ret = false; 679 rcu_read_unlock(); 680 preempt_enable(); 681 682 lockevent_cond_inc(rwsem_opt_fail, !ret); 683 return ret; 684 } 685 686 /* 687 * The rwsem_spin_on_owner() function returns the folowing 4 values 688 * depending on the lock owner state. 689 * OWNER_NULL : owner is currently NULL 690 * OWNER_WRITER: when owner changes and is a writer 691 * OWNER_READER: when owner changes and the new owner may be a reader. 692 * OWNER_NONSPINNABLE: 693 * when optimistic spinning has to stop because either the 694 * owner stops running, is unknown, or its timeslice has 695 * been used up. 696 */ 697 enum owner_state { 698 OWNER_NULL = 1 << 0, 699 OWNER_WRITER = 1 << 1, 700 OWNER_READER = 1 << 2, 701 OWNER_NONSPINNABLE = 1 << 3, 702 }; 703 #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) 704 705 static inline enum owner_state 706 rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) 707 { 708 if (flags & nonspinnable) 709 return OWNER_NONSPINNABLE; 710 711 if (flags & RWSEM_READER_OWNED) 712 return OWNER_READER; 713 714 return owner ? OWNER_WRITER : OWNER_NULL; 715 } 716 717 static noinline enum owner_state 718 rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) 719 { 720 struct task_struct *new, *owner; 721 unsigned long flags, new_flags; 722 enum owner_state state; 723 724 owner = rwsem_owner_flags(sem, &flags); 725 state = rwsem_owner_state(owner, flags, nonspinnable); 726 if (state != OWNER_WRITER) 727 return state; 728 729 rcu_read_lock(); 730 for (;;) { 731 /* 732 * When a waiting writer set the handoff flag, it may spin 733 * on the owner as well. Once that writer acquires the lock, 734 * we can spin on it. So we don't need to quit even when the 735 * handoff bit is set. 
736 */ 737 new = rwsem_owner_flags(sem, &new_flags); 738 if ((new != owner) || (new_flags != flags)) { 739 state = rwsem_owner_state(new, new_flags, nonspinnable); 740 break; 741 } 742 743 /* 744 * Ensure we emit the owner->on_cpu, dereference _after_ 745 * checking sem->owner still matches owner, if that fails, 746 * owner might point to free()d memory, if it still matches, 747 * the rcu_read_lock() ensures the memory stays valid. 748 */ 749 barrier(); 750 751 if (need_resched() || !owner_on_cpu(owner)) { 752 state = OWNER_NONSPINNABLE; 753 break; 754 } 755 756 cpu_relax(); 757 } 758 rcu_read_unlock(); 759 760 return state; 761 } 762 763 /* 764 * Calculate reader-owned rwsem spinning threshold for writer 765 * 766 * The more readers own the rwsem, the longer it will take for them to 767 * wind down and free the rwsem. So the empirical formula used to 768 * determine the actual spinning time limit here is: 769 * 770 * Spinning threshold = (10 + nr_readers/2)us 771 * 772 * The limit is capped to a maximum of 25us (30 readers). This is just 773 * a heuristic and is subjected to change in the future. 774 */ 775 static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) 776 { 777 long count = atomic_long_read(&sem->count); 778 int readers = count >> RWSEM_READER_SHIFT; 779 u64 delta; 780 781 if (readers > 30) 782 readers = 30; 783 delta = (20 + readers) * NSEC_PER_USEC / 2; 784 785 return sched_clock() + delta; 786 } 787 788 static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) 789 { 790 bool taken = false; 791 int prev_owner_state = OWNER_NULL; 792 int loop = 0; 793 u64 rspin_threshold = 0; 794 unsigned long nonspinnable = wlock ? 
RWSEM_WR_NONSPINNABLE 795 : RWSEM_RD_NONSPINNABLE; 796 797 preempt_disable(); 798 799 /* sem->wait_lock should not be held when doing optimistic spinning */ 800 if (!osq_lock(&sem->osq)) 801 goto done; 802 803 /* 804 * Optimistically spin on the owner field and attempt to acquire the 805 * lock whenever the owner changes. Spinning will be stopped when: 806 * 1) the owning writer isn't running; or 807 * 2) readers own the lock and spinning time has exceeded limit. 808 */ 809 for (;;) { 810 enum owner_state owner_state; 811 812 owner_state = rwsem_spin_on_owner(sem, nonspinnable); 813 if (!(owner_state & OWNER_SPINNABLE)) 814 break; 815 816 /* 817 * Try to acquire the lock 818 */ 819 taken = wlock ? rwsem_try_write_lock_unqueued(sem) 820 : rwsem_try_read_lock_unqueued(sem); 821 822 if (taken) 823 break; 824 825 /* 826 * Time-based reader-owned rwsem optimistic spinning 827 */ 828 if (wlock && (owner_state == OWNER_READER)) { 829 /* 830 * Re-initialize rspin_threshold every time when 831 * the owner state changes from non-reader to reader. 832 * This allows a writer to steal the lock in between 833 * 2 reader phases and have the threshold reset at 834 * the beginning of the 2nd reader phase. 835 */ 836 if (prev_owner_state != OWNER_READER) { 837 if (rwsem_test_oflags(sem, nonspinnable)) 838 break; 839 rspin_threshold = rwsem_rspin_threshold(sem); 840 loop = 0; 841 } 842 843 /* 844 * Check time threshold once every 16 iterations to 845 * avoid calling sched_clock() too frequently so 846 * as to reduce the average latency between the times 847 * when the lock becomes free and when the spinner 848 * is ready to do a trylock. 
849 */ 850 else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) { 851 rwsem_set_nonspinnable(sem); 852 lockevent_inc(rwsem_opt_nospin); 853 break; 854 } 855 } 856 857 /* 858 * An RT task cannot do optimistic spinning if it cannot 859 * be sure the lock holder is running or live-lock may 860 * happen if the current task and the lock holder happen 861 * to run in the same CPU. However, aborting optimistic 862 * spinning while a NULL owner is detected may miss some 863 * opportunity where spinning can continue without causing 864 * problem. 865 * 866 * There are 2 possible cases where an RT task may be able 867 * to continue spinning. 868 * 869 * 1) The lock owner is in the process of releasing the 870 * lock, sem->owner is cleared but the lock has not 871 * been released yet. 872 * 2) The lock was free and owner cleared, but another 873 * task just comes in and acquire the lock before 874 * we try to get it. The new owner may be a spinnable 875 * writer. 876 * 877 * To take advantage of two scenarios listed agove, the RT 878 * task is made to retry one more time to see if it can 879 * acquire the lock or continue spinning on the new owning 880 * writer. Of course, if the time lag is long enough or the 881 * new owner is not a writer or spinnable, the RT task will 882 * quit spinning. 883 * 884 * If the owner is a writer, the need_resched() check is 885 * done inside rwsem_spin_on_owner(). If the owner is not 886 * a writer, need_resched() check needs to be done here. 887 */ 888 if (owner_state != OWNER_WRITER) { 889 if (need_resched()) 890 break; 891 if (rt_task(current) && 892 (prev_owner_state != OWNER_WRITER)) 893 break; 894 } 895 prev_owner_state = owner_state; 896 897 /* 898 * The cpu_relax() call is a compiler barrier which forces 899 * everything in this loop to be re-loaded. We don't need 900 * memory barriers as we'll eventually observe the right 901 * values at the cost of a few extra spins. 
902 */ 903 cpu_relax(); 904 } 905 osq_unlock(&sem->osq); 906 done: 907 preempt_enable(); 908 lockevent_cond_inc(rwsem_opt_fail, !taken); 909 return taken; 910 } 911 912 /* 913 * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should 914 * only be called when the reader count reaches 0. 915 * 916 * This give writers better chance to acquire the rwsem first before 917 * readers when the rwsem was being held by readers for a relatively long 918 * period of time. Race can happen that an optimistic spinner may have 919 * just stolen the rwsem and set the owner, but just clearing the 920 * RWSEM_WR_NONSPINNABLE bit will do no harm anyway. 921 */ 922 static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) 923 { 924 if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) 925 atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); 926 } 927 928 /* 929 * This function is called when the reader fails to acquire the lock via 930 * optimistic spinning. In this case we will still attempt to do a trylock 931 * when comparing the rwsem state right now with the state when entering 932 * the slowpath indicates that the reader is still in a valid reader phase. 933 * This happens when the following conditions are true: 934 * 935 * 1) The lock is currently reader owned, and 936 * 2) The lock is previously not reader-owned or the last read owner changes. 937 * 938 * In the former case, we have transitioned from a writer phase to a 939 * reader-phase while spinning. In the latter case, it means the reader 940 * phase hasn't ended when we entered the optimistic spinning loop. In 941 * both cases, the reader is eligible to acquire the lock. This is the 942 * secondary path where a read lock is acquired optimistically. 943 * 944 * The reader non-spinnable bit wasn't set at time of entry or it will 945 * not be here at all. 
946 */ 947 static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, 948 unsigned long last_rowner) 949 { 950 unsigned long owner = atomic_long_read(&sem->owner); 951 952 if (!(owner & RWSEM_READER_OWNED)) 953 return false; 954 955 if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && 956 rwsem_try_read_lock_unqueued(sem)) { 957 lockevent_inc(rwsem_opt_rlock2); 958 lockevent_add(rwsem_opt_fail, -1); 959 return true; 960 } 961 return false; 962 } 963 #else 964 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, 965 unsigned long nonspinnable) 966 { 967 return false; 968 } 969 970 static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) 971 { 972 return false; 973 } 974 975 static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } 976 977 static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, 978 unsigned long last_rowner) 979 { 980 return false; 981 } 982 983 static inline int 984 rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) 985 { 986 return 0; 987 } 988 #define OWNER_NULL 1 989 #endif 990 991 /* 992 * Wait for the read lock to be granted 993 */ 994 static struct rw_semaphore __sched * 995 rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) 996 { 997 long count, adjustment = -RWSEM_READER_BIAS; 998 struct rwsem_waiter waiter; 999 DEFINE_WAKE_Q(wake_q); 1000 bool wake = false; 1001 1002 /* 1003 * Save the current read-owner of rwsem, if available, and the 1004 * reader nonspinnable bit. 1005 */ 1006 waiter.last_rowner = atomic_long_read(&sem->owner); 1007 if (!(waiter.last_rowner & RWSEM_READER_OWNED)) 1008 waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; 1009 1010 if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) 1011 goto queue; 1012 1013 /* 1014 * Undo read bias from down_read() and do optimistic spinning. 
1015 */ 1016 atomic_long_add(-RWSEM_READER_BIAS, &sem->count); 1017 adjustment = 0; 1018 if (rwsem_optimistic_spin(sem, false)) { 1019 /* rwsem_optimistic_spin() implies ACQUIRE on success */ 1020 /* 1021 * Wake up other readers in the wait list if the front 1022 * waiter is a reader. 1023 */ 1024 if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { 1025 raw_spin_lock_irq(&sem->wait_lock); 1026 if (!list_empty(&sem->wait_list)) 1027 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, 1028 &wake_q); 1029 raw_spin_unlock_irq(&sem->wait_lock); 1030 wake_up_q(&wake_q); 1031 } 1032 return sem; 1033 } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { 1034 /* rwsem_reader_phase_trylock() implies ACQUIRE on success */ 1035 return sem; 1036 } 1037 1038 queue: 1039 waiter.task = current; 1040 waiter.type = RWSEM_WAITING_FOR_READ; 1041 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; 1042 1043 raw_spin_lock_irq(&sem->wait_lock); 1044 if (list_empty(&sem->wait_list)) { 1045 /* 1046 * In case the wait queue is empty and the lock isn't owned 1047 * by a writer or has the handoff bit set, this reader can 1048 * exit the slowpath and return immediately as its 1049 * RWSEM_READER_BIAS has already been set in the count. 1050 */ 1051 if (adjustment && !(atomic_long_read(&sem->count) & 1052 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { 1053 /* Provide lock ACQUIRE */ 1054 smp_acquire__after_ctrl_dep(); 1055 raw_spin_unlock_irq(&sem->wait_lock); 1056 rwsem_set_reader_owned(sem); 1057 lockevent_inc(rwsem_rlock_fast); 1058 return sem; 1059 } 1060 adjustment += RWSEM_FLAG_WAITERS; 1061 } 1062 list_add_tail(&waiter.list, &sem->wait_list); 1063 1064 /* we're now waiting on the lock, but no longer actively locking */ 1065 if (adjustment) 1066 count = atomic_long_add_return(adjustment, &sem->count); 1067 else 1068 count = atomic_long_read(&sem->count); 1069 1070 /* 1071 * If there are no active locks, wake the front queued process(es). 
1072 * 1073 * If there are no writers and we are first in the queue, 1074 * wake our own waiter to join the existing active readers ! 1075 */ 1076 if (!(count & RWSEM_LOCK_MASK)) { 1077 clear_wr_nonspinnable(sem); 1078 wake = true; 1079 } 1080 if (wake || (!(count & RWSEM_WRITER_MASK) && 1081 (adjustment & RWSEM_FLAG_WAITERS))) 1082 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); 1083 1084 raw_spin_unlock_irq(&sem->wait_lock); 1085 wake_up_q(&wake_q); 1086 1087 /* wait to be given the lock */ 1088 for (;;) { 1089 set_current_state(state); 1090 if (!smp_load_acquire(&waiter.task)) { 1091 /* Matches rwsem_mark_wake()'s smp_store_release(). */ 1092 break; 1093 } 1094 if (signal_pending_state(state, current)) { 1095 raw_spin_lock_irq(&sem->wait_lock); 1096 if (waiter.task) 1097 goto out_nolock; 1098 raw_spin_unlock_irq(&sem->wait_lock); 1099 /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ 1100 break; 1101 } 1102 schedule(); 1103 lockevent_inc(rwsem_sleep_reader); 1104 } 1105 1106 __set_current_state(TASK_RUNNING); 1107 lockevent_inc(rwsem_rlock); 1108 return sem; 1109 1110 out_nolock: 1111 list_del(&waiter.list); 1112 if (list_empty(&sem->wait_list)) { 1113 atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, 1114 &sem->count); 1115 } 1116 raw_spin_unlock_irq(&sem->wait_lock); 1117 __set_current_state(TASK_RUNNING); 1118 lockevent_inc(rwsem_rlock_fail); 1119 return ERR_PTR(-EINTR); 1120 } 1121 1122 /* 1123 * This function is called by the a write lock owner. So the owner value 1124 * won't get changed by others. 
1125 */ 1126 static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, 1127 bool disable) 1128 { 1129 if (unlikely(disable)) { 1130 atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); 1131 lockevent_inc(rwsem_opt_norspin); 1132 } 1133 } 1134 1135 /* 1136 * Wait until we successfully acquire the write lock 1137 */ 1138 static struct rw_semaphore * 1139 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) 1140 { 1141 long count; 1142 bool disable_rspin; 1143 enum writer_wait_state wstate; 1144 struct rwsem_waiter waiter; 1145 struct rw_semaphore *ret = sem; 1146 DEFINE_WAKE_Q(wake_q); 1147 1148 /* do optimistic spinning and steal lock if possible */ 1149 if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && 1150 rwsem_optimistic_spin(sem, true)) { 1151 /* rwsem_optimistic_spin() implies ACQUIRE on success */ 1152 return sem; 1153 } 1154 1155 /* 1156 * Disable reader optimistic spinning for this rwsem after 1157 * acquiring the write lock when the setting of the nonspinnable 1158 * bits are observed. 1159 */ 1160 disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; 1161 1162 /* 1163 * Optimistic spinning failed, proceed to the slowpath 1164 * and block until we can acquire the sem. 1165 */ 1166 waiter.task = current; 1167 waiter.type = RWSEM_WAITING_FOR_WRITE; 1168 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; 1169 1170 raw_spin_lock_irq(&sem->wait_lock); 1171 1172 /* account for this before adding a new element to the list */ 1173 wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; 1174 1175 list_add_tail(&waiter.list, &sem->wait_list); 1176 1177 /* we're now waiting on the lock */ 1178 if (wstate == WRITER_NOT_FIRST) { 1179 count = atomic_long_read(&sem->count); 1180 1181 /* 1182 * If there were already threads queued before us and: 1183 * 1) there are no no active locks, wake the front 1184 * queued process(es) as the handoff bit might be set. 
1185 * 2) there are no active writers and some readers, the lock 1186 * must be read owned; so we try to wake any read lock 1187 * waiters that were queued ahead of us. 1188 */ 1189 if (count & RWSEM_WRITER_MASK) 1190 goto wait; 1191 1192 rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) 1193 ? RWSEM_WAKE_READERS 1194 : RWSEM_WAKE_ANY, &wake_q); 1195 1196 if (!wake_q_empty(&wake_q)) { 1197 /* 1198 * We want to minimize wait_lock hold time especially 1199 * when a large number of readers are to be woken up. 1200 */ 1201 raw_spin_unlock_irq(&sem->wait_lock); 1202 wake_up_q(&wake_q); 1203 wake_q_init(&wake_q); /* Used again, reinit */ 1204 raw_spin_lock_irq(&sem->wait_lock); 1205 } 1206 } else { 1207 atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); 1208 } 1209 1210 wait: 1211 /* wait until we successfully acquire the lock */ 1212 set_current_state(state); 1213 for (;;) { 1214 if (rwsem_try_write_lock(sem, wstate)) { 1215 /* rwsem_try_write_lock() implies ACQUIRE on success */ 1216 break; 1217 } 1218 1219 raw_spin_unlock_irq(&sem->wait_lock); 1220 1221 /* 1222 * After setting the handoff bit and failing to acquire 1223 * the lock, attempt to spin on owner to accelerate lock 1224 * transfer. If the previous owner is a on-cpu writer and it 1225 * has just released the lock, OWNER_NULL will be returned. 1226 * In this case, we attempt to acquire the lock again 1227 * without sleeping. 1228 */ 1229 if ((wstate == WRITER_HANDOFF) && 1230 (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) 1231 goto trylock_again; 1232 1233 /* Block until there are no active lockers. */ 1234 for (;;) { 1235 if (signal_pending_state(state, current)) 1236 goto out_nolock; 1237 1238 schedule(); 1239 lockevent_inc(rwsem_sleep_writer); 1240 set_current_state(state); 1241 /* 1242 * If HANDOFF bit is set, unconditionally do 1243 * a trylock. 
1244 */ 1245 if (wstate == WRITER_HANDOFF) 1246 break; 1247 1248 if ((wstate == WRITER_NOT_FIRST) && 1249 (rwsem_first_waiter(sem) == &waiter)) 1250 wstate = WRITER_FIRST; 1251 1252 count = atomic_long_read(&sem->count); 1253 if (!(count & RWSEM_LOCK_MASK)) 1254 break; 1255 1256 /* 1257 * The setting of the handoff bit is deferred 1258 * until rwsem_try_write_lock() is called. 1259 */ 1260 if ((wstate == WRITER_FIRST) && (rt_task(current) || 1261 time_after(jiffies, waiter.timeout))) { 1262 wstate = WRITER_HANDOFF; 1263 lockevent_inc(rwsem_wlock_handoff); 1264 break; 1265 } 1266 } 1267 trylock_again: 1268 raw_spin_lock_irq(&sem->wait_lock); 1269 } 1270 __set_current_state(TASK_RUNNING); 1271 list_del(&waiter.list); 1272 rwsem_disable_reader_optspin(sem, disable_rspin); 1273 raw_spin_unlock_irq(&sem->wait_lock); 1274 lockevent_inc(rwsem_wlock); 1275 1276 return ret; 1277 1278 out_nolock: 1279 __set_current_state(TASK_RUNNING); 1280 raw_spin_lock_irq(&sem->wait_lock); 1281 list_del(&waiter.list); 1282 1283 if (unlikely(wstate == WRITER_HANDOFF)) 1284 atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); 1285 1286 if (list_empty(&sem->wait_list)) 1287 atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); 1288 else 1289 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); 1290 raw_spin_unlock_irq(&sem->wait_lock); 1291 wake_up_q(&wake_q); 1292 lockevent_inc(rwsem_wlock_fail); 1293 1294 return ERR_PTR(-EINTR); 1295 } 1296 1297 /* 1298 * handle waking up a waiter on the semaphore 1299 * - up_read/up_write has decremented the active part of count if we come here 1300 */ 1301 static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) 1302 { 1303 unsigned long flags; 1304 DEFINE_WAKE_Q(wake_q); 1305 1306 raw_spin_lock_irqsave(&sem->wait_lock, flags); 1307 1308 if (!list_empty(&sem->wait_list)) 1309 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); 1310 1311 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 1312 wake_up_q(&wake_q); 1313 1314 return sem; 1315 } 1316 
1317 /* 1318 * downgrade a write lock into a read lock 1319 * - caller incremented waiting part of count and discovered it still negative 1320 * - just wake up any readers at the front of the queue 1321 */ 1322 static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) 1323 { 1324 unsigned long flags; 1325 DEFINE_WAKE_Q(wake_q); 1326 1327 raw_spin_lock_irqsave(&sem->wait_lock, flags); 1328 1329 if (!list_empty(&sem->wait_list)) 1330 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); 1331 1332 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 1333 wake_up_q(&wake_q); 1334 1335 return sem; 1336 } 1337 1338 /* 1339 * lock for reading 1340 */ 1341 inline void __down_read(struct rw_semaphore *sem) 1342 { 1343 if (!rwsem_read_trylock(sem)) { 1344 rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); 1345 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); 1346 } else { 1347 rwsem_set_reader_owned(sem); 1348 } 1349 } 1350 1351 static inline int __down_read_killable(struct rw_semaphore *sem) 1352 { 1353 if (!rwsem_read_trylock(sem)) { 1354 if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) 1355 return -EINTR; 1356 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); 1357 } else { 1358 rwsem_set_reader_owned(sem); 1359 } 1360 return 0; 1361 } 1362 1363 static inline int __down_read_trylock(struct rw_semaphore *sem) 1364 { 1365 long tmp; 1366 1367 DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); 1368 1369 /* 1370 * Optimize for the case when the rwsem is not locked at all. 
1371 */ 1372 tmp = RWSEM_UNLOCKED_VALUE; 1373 do { 1374 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, 1375 tmp + RWSEM_READER_BIAS)) { 1376 rwsem_set_reader_owned(sem); 1377 return 1; 1378 } 1379 } while (!(tmp & RWSEM_READ_FAILED_MASK)); 1380 return 0; 1381 } 1382 1383 /* 1384 * lock for writing 1385 */ 1386 static inline void __down_write(struct rw_semaphore *sem) 1387 { 1388 long tmp = RWSEM_UNLOCKED_VALUE; 1389 1390 if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, 1391 RWSEM_WRITER_LOCKED))) 1392 rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); 1393 else 1394 rwsem_set_owner(sem); 1395 } 1396 1397 static inline int __down_write_killable(struct rw_semaphore *sem) 1398 { 1399 long tmp = RWSEM_UNLOCKED_VALUE; 1400 1401 if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, 1402 RWSEM_WRITER_LOCKED))) { 1403 if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) 1404 return -EINTR; 1405 } else { 1406 rwsem_set_owner(sem); 1407 } 1408 return 0; 1409 } 1410 1411 static inline int __down_write_trylock(struct rw_semaphore *sem) 1412 { 1413 long tmp; 1414 1415 DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); 1416 1417 tmp = RWSEM_UNLOCKED_VALUE; 1418 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, 1419 RWSEM_WRITER_LOCKED)) { 1420 rwsem_set_owner(sem); 1421 return true; 1422 } 1423 return false; 1424 } 1425 1426 /* 1427 * unlock after reading 1428 */ 1429 inline void __up_read(struct rw_semaphore *sem) 1430 { 1431 long tmp; 1432 1433 DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); 1434 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); 1435 1436 rwsem_clear_reader_owned(sem); 1437 tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); 1438 DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); 1439 if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == 1440 RWSEM_FLAG_WAITERS)) { 1441 clear_wr_nonspinnable(sem); 1442 rwsem_wake(sem, tmp); 1443 } 1444 } 1445 1446 /* 1447 * unlock after writing 1448 */ 1449 static 
inline void __up_write(struct rw_semaphore *sem) 1450 { 1451 long tmp; 1452 1453 DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); 1454 /* 1455 * sem->owner may differ from current if the ownership is transferred 1456 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. 1457 */ 1458 DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && 1459 !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); 1460 1461 rwsem_clear_owner(sem); 1462 tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); 1463 if (unlikely(tmp & RWSEM_FLAG_WAITERS)) 1464 rwsem_wake(sem, tmp); 1465 } 1466 1467 /* 1468 * downgrade write lock to read lock 1469 */ 1470 static inline void __downgrade_write(struct rw_semaphore *sem) 1471 { 1472 long tmp; 1473 1474 /* 1475 * When downgrading from exclusive to shared ownership, 1476 * anything inside the write-locked region cannot leak 1477 * into the read side. In contrast, anything in the 1478 * read-locked region is ok to be re-ordered into the 1479 * write side. As such, rely on RELEASE semantics. 
1480 */ 1481 DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); 1482 tmp = atomic_long_fetch_add_release( 1483 -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); 1484 rwsem_set_reader_owned(sem); 1485 if (tmp & RWSEM_FLAG_WAITERS) 1486 rwsem_downgrade_wake(sem); 1487 } 1488 1489 /* 1490 * lock for reading 1491 */ 1492 void __sched down_read(struct rw_semaphore *sem) 1493 { 1494 might_sleep(); 1495 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 1496 1497 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 1498 } 1499 EXPORT_SYMBOL(down_read); 1500 1501 int __sched down_read_killable(struct rw_semaphore *sem) 1502 { 1503 might_sleep(); 1504 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 1505 1506 if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { 1507 rwsem_release(&sem->dep_map, _RET_IP_); 1508 return -EINTR; 1509 } 1510 1511 return 0; 1512 } 1513 EXPORT_SYMBOL(down_read_killable); 1514 1515 /* 1516 * trylock for reading -- returns 1 if successful, 0 if contention 1517 */ 1518 int down_read_trylock(struct rw_semaphore *sem) 1519 { 1520 int ret = __down_read_trylock(sem); 1521 1522 if (ret == 1) 1523 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); 1524 return ret; 1525 } 1526 EXPORT_SYMBOL(down_read_trylock); 1527 1528 /* 1529 * lock for writing 1530 */ 1531 void __sched down_write(struct rw_semaphore *sem) 1532 { 1533 might_sleep(); 1534 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 1535 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 1536 } 1537 EXPORT_SYMBOL(down_write); 1538 1539 /* 1540 * lock for writing 1541 */ 1542 int __sched down_write_killable(struct rw_semaphore *sem) 1543 { 1544 might_sleep(); 1545 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 1546 1547 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, 1548 __down_write_killable)) { 1549 rwsem_release(&sem->dep_map, _RET_IP_); 1550 return -EINTR; 1551 } 1552 1553 return 0; 1554 } 1555 EXPORT_SYMBOL(down_write_killable); 1556 1557 /* 1558 * 
trylock for writing -- returns 1 if successful, 0 if contention 1559 */ 1560 int down_write_trylock(struct rw_semaphore *sem) 1561 { 1562 int ret = __down_write_trylock(sem); 1563 1564 if (ret == 1) 1565 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); 1566 1567 return ret; 1568 } 1569 EXPORT_SYMBOL(down_write_trylock); 1570 1571 /* 1572 * release a read lock 1573 */ 1574 void up_read(struct rw_semaphore *sem) 1575 { 1576 rwsem_release(&sem->dep_map, _RET_IP_); 1577 __up_read(sem); 1578 } 1579 EXPORT_SYMBOL(up_read); 1580 1581 /* 1582 * release a write lock 1583 */ 1584 void up_write(struct rw_semaphore *sem) 1585 { 1586 rwsem_release(&sem->dep_map, _RET_IP_); 1587 __up_write(sem); 1588 } 1589 EXPORT_SYMBOL(up_write); 1590 1591 /* 1592 * downgrade write lock to read lock 1593 */ 1594 void downgrade_write(struct rw_semaphore *sem) 1595 { 1596 lock_downgrade(&sem->dep_map, _RET_IP_); 1597 __downgrade_write(sem); 1598 } 1599 EXPORT_SYMBOL(downgrade_write); 1600 1601 #ifdef CONFIG_DEBUG_LOCK_ALLOC 1602 1603 void down_read_nested(struct rw_semaphore *sem, int subclass) 1604 { 1605 might_sleep(); 1606 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 1607 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 1608 } 1609 EXPORT_SYMBOL(down_read_nested); 1610 1611 void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) 1612 { 1613 might_sleep(); 1614 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 1615 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 1616 } 1617 EXPORT_SYMBOL(_down_write_nest_lock); 1618 1619 void down_read_non_owner(struct rw_semaphore *sem) 1620 { 1621 might_sleep(); 1622 __down_read(sem); 1623 __rwsem_set_reader_owned(sem, NULL); 1624 } 1625 EXPORT_SYMBOL(down_read_non_owner); 1626 1627 void down_write_nested(struct rw_semaphore *sem, int subclass) 1628 { 1629 might_sleep(); 1630 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 1631 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 1632 } 1633 
EXPORT_SYMBOL(down_write_nested); 1634 1635 int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) 1636 { 1637 might_sleep(); 1638 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 1639 1640 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, 1641 __down_write_killable)) { 1642 rwsem_release(&sem->dep_map, _RET_IP_); 1643 return -EINTR; 1644 } 1645 1646 return 0; 1647 } 1648 EXPORT_SYMBOL(down_write_killable_nested); 1649 1650 void up_read_non_owner(struct rw_semaphore *sem) 1651 { 1652 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); 1653 __up_read(sem); 1654 } 1655 EXPORT_SYMBOL(up_read_non_owner); 1656 1657 #endif 1658