// SPDX-License-Identifier: GPL-2.0-only
/*
 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
 *
 * started by Ingo Molnar and Thomas Gleixner.
 *
 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
 * Copyright (C) 2006 Esben Nielsen
 * Adaptive Spinlocks:
 *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
 *                                   and Peter Morreale,
 * Adaptive Spinlocks simplification:
 *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
 *
 * See Documentation/locking/rt-mutex-design.rst for details.
 */
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/sched/deadline.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
#include <linux/sched/wake_q.h>
#include <linux/ww_mutex.h>

#include <trace/events/lock.h>

#include "rtmutex_common.h"

#ifndef WW_RT
# define build_ww_mutex()       (false)
# define ww_container_of(rtm)   NULL

static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
                                        struct rt_mutex *lock,
                                        struct ww_acquire_ctx *ww_ctx,
                                        struct wake_q_head *wake_q)
{
        return 0;
}

static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
                                            struct ww_acquire_ctx *ww_ctx,
                                            struct wake_q_head *wake_q)
{
}

static inline void ww_mutex_lock_acquired(struct ww_mutex *lock,
                                          struct ww_acquire_ctx *ww_ctx)
{
}

static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
                                        struct rt_mutex_waiter *waiter,
                                        struct ww_acquire_ctx *ww_ctx)
{
        return 0;
}

#else
# define build_ww_mutex()       (true)
# define ww_container_of(rtm)   container_of(rtm, struct ww_mutex, base)
# include "ww_mutex.h"
#endif

/*
 * lock->owner state tracking:
 *
 * lock->owner holds the task_struct pointer of the owner. Bit 0
 * is used to keep track of the "lock has waiters" state.
 *
 * owner        bit0
 * NULL         0       lock is free (fast acquire possible)
 * NULL         1       lock is free and has waiters and the top waiter
 *                      is going to take the lock*
 * taskpointer  0       lock is held (fast release possible)
 * taskpointer  1       lock is held and has waiters**
 *
 * The fast atomic compare exchange based acquire and release is only
 * possible when bit 0 of lock->owner is 0.
 *
 * (*) It also can be a transitional state when grabbing the lock
 * with ->wait_lock held. To prevent any fast path cmpxchg to the lock,
 * we need to set bit 0 before looking at the lock, and the owner may
 * be NULL during this small window, hence this can be a transitional
 * state.
 *
 * (**) There is a small time when bit 0 is set but there are no
 * waiters. This can happen when grabbing the lock in the slow path.
 * To prevent a cmpxchg of the owner releasing the lock, we need to
 * set this bit before looking at the lock.
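 *
 * For illustration only (the addresses are made up): task_struct pointers
 * are at least word aligned, so bit 0 of the pointer is always free and
 * the values read back from lock->owner look like:
 *
 *   (struct task_struct *)0x0                   lock free, no waiters
 *   (struct task_struct *)0x1                   lock free, top waiter about
 *                                               to take it
 *   (struct task_struct *)0xffff888123456900    lock held, no waiters queued
 *   (struct task_struct *)0xffff888123456901    lock held, waiters queued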
 */

static __always_inline struct task_struct *
rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
{
        unsigned long val = (unsigned long)owner;

        if (rt_mutex_has_waiters(lock))
                val |= RT_MUTEX_HAS_WAITERS;

        return (struct task_struct *)val;
}

static __always_inline void
rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
{
        /*
         * lock->wait_lock is held but explicit acquire semantics are needed
         * for a new lock owner so WRITE_ONCE is insufficient.
         */
        xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
}

static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
{
        /* lock->wait_lock is held so the unlock provides release semantics. */
        WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
}

static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
{
        lock->owner = (struct task_struct *)
                        ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}

static __always_inline void
fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
{
        unsigned long owner, *p = (unsigned long *) &lock->owner;

        if (rt_mutex_has_waiters(lock))
                return;

        /*
         * The rbtree has no waiters enqueued, now make sure that the
         * lock->owner still has the waiters bit set, otherwise the
         * following can happen:
         *
         * CPU 0                CPU 1                   CPU 2
         * l->owner = T1
         *                      rt_mutex_lock(l)
         *                      lock(l->lock)
         *                      l->owner = T1 | HAS_WAITERS;
         *                      enqueue(T2)
         *                      boost()
         *                        unlock(l->lock)
         *                      block()
         *
         *                                              rt_mutex_lock(l)
         *                                              lock(l->lock)
         *                                              l->owner = T1 | HAS_WAITERS;
         *                                              enqueue(T3)
         *                                              boost()
         *                                                unlock(l->lock)
         *                                              block()
         *                      signal(->T2)            signal(->T3)
         *                      lock(l->lock)
         *                      dequeue(T2)
         *                      deboost()
         *                        unlock(l->lock)
         *                                              lock(l->lock)
         *                                              dequeue(T3)
         *                                               ==> wait list is empty
         *                                              deboost()
         *                                               unlock(l->lock)
         *                      lock(l->lock)
         *                      fixup_rt_mutex_waiters()
         *                        if (wait_list_empty(l)) {
         *                          owner = l->owner & ~HAS_WAITERS;
         *                          l->owner = owner
         *                            ==> l->owner = T1
         *                        }
         *                                              lock(l->lock)
         * rt_mutex_unlock(l)                           fixup_rt_mutex_waiters()
         *                                                if (wait_list_empty(l)) {
         *                                                  owner = l->owner & ~HAS_WAITERS;
         * cmpxchg(l->owner, T1, NULL)
         *  ===> Success (l->owner = NULL)
         *
         *                                                  l->owner = owner
         *                                                    ==> l->owner = T1
         *                                                }
         *
         * With the check for the waiter bit in place T3 on CPU 2 will not
         * overwrite. All tasks fiddling with the waiters bit are
         * serialized by l->lock, so nothing else can modify the waiters
         * bit. If the bit is set then nothing can change l->owner either
         * so the simple RMW is safe. The cmpxchg() will simply fail if it
         * happens in the middle of the RMW because the waiters bit is
         * still set.
         */
        owner = READ_ONCE(*p);
        if (owner & RT_MUTEX_HAS_WAITERS) {
                /*
                 * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
                 * why xchg_acquire() is used for updating owner for
                 * locking and WRITE_ONCE() for unlocking.
                 *
                 * WRITE_ONCE() would work for the acquire case too, but
                 * in case that the lock acquisition failed it might
                 * force other lockers into the slow path unnecessarily.
                 */
                if (acquire_lock)
                        xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
                else
                        WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
        }
}

/*
 * We can speed up the acquire/release, if there's no debugging state to be
 * set up.
 */
#ifndef CONFIG_DEBUG_RT_MUTEXES
static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
                                                     struct task_struct *old,
                                                     struct task_struct *new)
{
        return try_cmpxchg_acquire(&lock->owner, &old, new);
}

static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
{
        return rt_mutex_cmpxchg_acquire(lock, NULL, current);
}

static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
                                                     struct task_struct *old,
                                                     struct task_struct *new)
{
        return try_cmpxchg_release(&lock->owner, &old, new);
}

/*
 * Callers must hold the ->wait_lock -- which is the whole purpose as we force
 * all future threads that attempt to [Rmw] the lock to the slowpath. As such
 * relaxed semantics suffice.
 */
static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
        unsigned long *p = (unsigned long *) &lock->owner;
        unsigned long owner, new;

        owner = READ_ONCE(*p);
        do {
                new = owner | RT_MUTEX_HAS_WAITERS;
        } while (!try_cmpxchg_relaxed(p, &owner, new));

        /*
         * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
         * operations in the event of contention. Ensure the successful
         * cmpxchg is visible.
         */
        smp_mb__after_atomic();
}

/*
 * Safe fastpath aware unlock:
 * 1) Clear the waiters bit
 * 2) Drop lock->wait_lock
 * 3) Try to unlock the lock with cmpxchg
 */
static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
                                                 unsigned long flags)
        __releases(lock->wait_lock)
{
        struct task_struct *owner = rt_mutex_owner(lock);

        clear_rt_mutex_waiters(lock);
        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
        /*
         * If a new waiter comes in between the unlock and the cmpxchg
         * we have two situations:
         *
         * unlock(wait_lock);
         *                                      lock(wait_lock);
         * cmpxchg(p, owner, 0) == owner
         *                                      mark_rt_mutex_waiters(lock);
         *                                      acquire(lock);
         * or:
         *
         * unlock(wait_lock);
         *                                      lock(wait_lock);
         *                                      mark_rt_mutex_waiters(lock);
         *
         * cmpxchg(p, owner, 0) != owner
         *                                      enqueue_waiter();
         *                                      unlock(wait_lock);
         * lock(wait_lock);
         * wake waiter();
         * unlock(wait_lock);
         *                                      lock(wait_lock);
         *                                      acquire(lock);
         */
        return rt_mutex_cmpxchg_release(lock, owner, NULL);
}

#else
static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
                                                     struct task_struct *old,
                                                     struct task_struct *new)
{
        return false;
}

static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);

static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
{
        /*
         * With debug enabled the cmpxchg based trylock above always fails.
         *
         * Avoid unconditionally taking the slow path by using
         * rt_mutex_slowtrylock() which is covered by the debug code and can
         * acquire a non-contended rtmutex.
         */
        return rt_mutex_slowtrylock(lock);
}

static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
                                                     struct task_struct *old,
                                                     struct task_struct *new)
{
        return false;
}

static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
        lock->owner = (struct task_struct *)
                        ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
}

/*
 * Simple slow path only version: lock->owner is protected by lock->wait_lock.
 */
static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
                                                 unsigned long flags)
        __releases(lock->wait_lock)
{
        lock->owner = NULL;
        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
        return true;
}
#endif

static __always_inline int __waiter_prio(struct task_struct *task)
{
        int prio = task->prio;

        if (!rt_or_dl_prio(prio))
                return DEFAULT_PRIO;

        return prio;
}

/*
 * Update the waiter->tree copy of the sort keys.
 */
static __always_inline void
waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
{
        lockdep_assert_held(&waiter->lock->wait_lock);
        lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));

        waiter->tree.prio = __waiter_prio(task);
        waiter->tree.deadline = task->dl.deadline;
}

/*
 * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
 */
static __always_inline void
waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
{
        lockdep_assert_held(&waiter->lock->wait_lock);
        lockdep_assert_held(&task->pi_lock);
        lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));

        waiter->pi_tree.prio = waiter->tree.prio;
        waiter->pi_tree.deadline = waiter->tree.deadline;
}

/*
 * Only use with rt_waiter_node_{less,equal}()
 */
#define task_to_waiter_node(p)  \
        &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
#define task_to_waiter(p)       \
        &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }

static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
                                               struct rt_waiter_node *right)
{
        if (left->prio < right->prio)
                return 1;

        /*
         * If both waiters have dl_prio(), we check the deadlines of the
         * associated tasks.
         * If left waiter has a dl_prio(), and we didn't return 1 above,
         * then right waiter has a dl_prio() too.
         */
        if (dl_prio(left->prio))
                return dl_time_before(left->deadline, right->deadline);

        return 0;
}

static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
                                                struct rt_waiter_node *right)
{
        if (left->prio != right->prio)
                return 0;

        /*
         * If both waiters have dl_prio(), we check the deadlines of the
         * associated tasks.
         * If left waiter has a dl_prio(), and we didn't return 0 above,
         * then right waiter has a dl_prio() too.
         */
        if (dl_prio(left->prio))
                return left->deadline == right->deadline;

        return 1;
}

static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
                                  struct rt_mutex_waiter *top_waiter)
{
        if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
                return true;

#ifdef RT_MUTEX_BUILD_SPINLOCKS
        /*
         * Note that RT tasks are excluded from same priority (lateral)
         * steals to prevent the introduction of an unbounded latency.
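         *
         * As a consequence, lateral stealing only ever happens between two
         * fair-class waiters, both of which __waiter_prio() maps to
         * DEFAULT_PRIO; an RT or DL waiter can only take the lock over when
         * it is strictly higher priority (or has an earlier deadline) per
         * rt_waiter_node_less() above.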
         */
        if (rt_or_dl_prio(waiter->tree.prio))
                return false;

        return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
#else
        return false;
#endif
}

#define __node_2_waiter(node) \
        rb_entry((node), struct rt_mutex_waiter, tree.entry)

static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
{
        struct rt_mutex_waiter *aw = __node_2_waiter(a);
        struct rt_mutex_waiter *bw = __node_2_waiter(b);

        if (rt_waiter_node_less(&aw->tree, &bw->tree))
                return 1;

        if (!build_ww_mutex())
                return 0;

        if (rt_waiter_node_less(&bw->tree, &aw->tree))
                return 0;

        /* NOTE: relies on waiter->ww_ctx being set before insertion */
        if (aw->ww_ctx) {
                if (!bw->ww_ctx)
                        return 1;

                return (signed long)(aw->ww_ctx->stamp -
                                     bw->ww_ctx->stamp) < 0;
        }

        return 0;
}

static __always_inline void
rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
        lockdep_assert_held(&lock->wait_lock);

        rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
}

static __always_inline void
rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
        lockdep_assert_held(&lock->wait_lock);

        if (RB_EMPTY_NODE(&waiter->tree.entry))
                return;

        rb_erase_cached(&waiter->tree.entry, &lock->waiters);
        RB_CLEAR_NODE(&waiter->tree.entry);
}

#define __node_2_rt_node(node) \
        rb_entry((node), struct rt_waiter_node, entry)

static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
{
        return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
}

static __always_inline void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
        lockdep_assert_held(&task->pi_lock);

        rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
}

static __always_inline void
rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
        lockdep_assert_held(&task->pi_lock);

        if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
                return;

        rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
        RB_CLEAR_NODE(&waiter->pi_tree.entry);
}

static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
                                                 struct task_struct *p)
{
        struct task_struct *pi_task = NULL;

        lockdep_assert_held(&lock->wait_lock);
        lockdep_assert(rt_mutex_owner(lock) == p);
        lockdep_assert_held(&p->pi_lock);

        if (task_has_pi_waiters(p))
                pi_task = task_top_pi_waiter(p)->task;

        rt_mutex_setprio(p, pi_task);
}

/* RT mutex specific wake_q wrappers */
static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
                                                     struct task_struct *task,
                                                     unsigned int wake_state)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
                if (IS_ENABLED(CONFIG_PROVE_LOCKING))
                        WARN_ON_ONCE(wqh->rtlock_task);
                get_task_struct(task);
                wqh->rtlock_task = task;
        } else {
                wake_q_add(&wqh->head, task);
        }
}

static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
                                                struct rt_mutex_waiter *w)
{
        rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
}

static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
                wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
                put_task_struct(wqh->rtlock_task);
                wqh->rtlock_task = NULL;
        }

        if (!wake_q_empty(&wqh->head))
                wake_up_q(&wqh->head);

        /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
        preempt_enable();
}

/*
 * Deadlock detection is conditional:
 *
 * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
 * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
 *
 * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
 * conducted independent of the detect argument.
 *
 * If the waiter argument is NULL this indicates the deboost path and
 * deadlock detection is disabled independent of the detect argument
 * and the config settings.
 */
static __always_inline bool
rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
                              enum rtmutex_chainwalk chwalk)
{
        if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
                return waiter != NULL;
        return chwalk == RT_MUTEX_FULL_CHAINWALK;
}

static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p)
{
        return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
}

/*
 * Adjust the priority chain. Also used for deadlock detection.
 * Decreases task's usage by one - may thus free the task.
 *
 * @task:        the task owning the mutex (owner) for which a chain walk is
 *               probably needed
 * @chwalk:      do we have to carry out deadlock detection?
 * @orig_lock:   the mutex (can be NULL if we are walking the chain to recheck
 *               things for a task that has just got its priority adjusted, and
 *               is waiting on a mutex)
 * @next_lock:   the mutex on which the owner of @orig_lock was blocked before
 *               we dropped its pi_lock. Is never dereferenced, only used for
 *               comparison to detect lock chain changes.
 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
 *               its priority to the mutex owner (can be NULL in the case
 *               depicted above or if the top waiter has gone away and we are
 *               actually deboosting the owner)
 * @top_task:    the current top waiter
 *
 * Returns 0 or -EDEADLK.
 *
 * Chain walk basics and protection scope
 *
 * [R] refcount on task
 * [Pn] task->pi_lock held
 * [L] rtmutex->wait_lock held
 *
 * Normal locking order:
 *
 *   rtmutex->wait_lock
 *     task->pi_lock
 *
 * Step   Description                              Protected by
 *        function arguments:
 *        @task                                    [R]
 *        @orig_lock if != NULL                    @top_task is blocked on it
 *        @next_lock                               Unprotected. Cannot be
 *                                                 dereferenced. Only used for
 *                                                 comparison.
 *        @orig_waiter if != NULL                  @top_task is blocked on it
 *        @top_task                                current, or in case of proxy
 *                                                 locking protected by calling
 *                                                 code
 * again:
 *        loop_sanity_check();
 * retry:
 * [1]    lock(task->pi_lock);                     [R] acquire [P1]
 * [2]    waiter = task->pi_blocked_on;            [P1]
 * [3]    check_exit_conditions_1();               [P1]
 * [4]    lock = waiter->lock;                     [P1]
 * [5]    if (!try_lock(lock->wait_lock)) {        [P1] try to acquire [L]
 *          unlock(task->pi_lock);                 release [P1]
 *          goto retry;
 *        }
 * [6]    check_exit_conditions_2();               [P1] + [L]
 * [7]    requeue_lock_waiter(lock, waiter);       [P1] + [L]
 * [8]    unlock(task->pi_lock);                   release [P1]
 *        put_task_struct(task);                   release [R]
 * [9]    check_exit_conditions_3();               [L]
 * [10]   task = owner(lock);                      [L]
 *        get_task_struct(task);                   [L] acquire [R]
 *        lock(task->pi_lock);                     [L] acquire [P2]
 * [11]   requeue_pi_waiter(tsk, waiters(lock));   [P2] + [L]
 * [12]   check_exit_conditions_4();               [P2] + [L]
 * [13]   unlock(task->pi_lock);                   release [P2]
 *        unlock(lock->wait_lock);                 release [L]
 *        goto again;
 *
 * Where P1 is the blocking task and P2 is the lock owner; going up one step
 * the owner becomes the next blocked task etc..
 */
static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
                                              enum rtmutex_chainwalk chwalk,
                                              struct rt_mutex_base *orig_lock,
                                              struct rt_mutex_base *next_lock,
                                              struct rt_mutex_waiter *orig_waiter,
                                              struct task_struct *top_task)
{
        struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
        struct rt_mutex_waiter *prerequeue_top_waiter;
        int ret = 0, depth = 0;
        struct rt_mutex_base *lock;
        bool detect_deadlock;
        bool requeue = true;

        detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);

        /*
         * The (de)boosting is a step by step approach with a lot of
         * pitfalls. We want this to be preemptible and we want to hold a
         * maximum of two locks per step. So we have to check
         * carefully whether things change under us.
         */
 again:
        /*
         * We limit the lock chain length for each invocation.
         */
        if (++depth > max_lock_depth) {
                static int prev_max;

                /*
                 * Print this only once. If the admin changes the limit,
                 * print a new message when reaching the limit again.
                 */
                if (prev_max != max_lock_depth) {
                        prev_max = max_lock_depth;
                        printk(KERN_WARNING "Maximum lock depth %d reached "
                               "task: %s (%d)\n", max_lock_depth,
                               top_task->comm, task_pid_nr(top_task));
                }
                put_task_struct(task);

                return -EDEADLK;
        }

        /*
         * We are fully preemptible here and only hold the refcount on
         * @task. So everything can have changed under us since the
         * caller or our own code below (goto retry/again) dropped all
         * locks.
         */
 retry:
        /*
         * [1] Task cannot go away as we did a get_task() before !
         */
        raw_spin_lock_irq(&task->pi_lock);

        /*
         * [2] Get the waiter on which @task is blocked.
         */
        waiter = task->pi_blocked_on;

        /*
         * [3] check_exit_conditions_1() protected by task->pi_lock.
         */

        /*
         * Check whether the end of the boosting chain has been
         * reached or the state of the chain has changed while we
         * dropped the locks.
         */
        if (!waiter)
                goto out_unlock_pi;

        /*
         * Check the orig_waiter state. After we dropped the locks,
         * the previous owner of the lock might have released the lock.
         */
        if (orig_waiter && !rt_mutex_owner(orig_lock))
                goto out_unlock_pi;

        /*
         * We dropped all locks after taking a refcount on @task, so
         * the task might have moved on in the lock chain or even left
         * the chain completely and blocks now on an unrelated lock or
         * on @orig_lock.
         *
         * We stored the lock on which @task was blocked in @next_lock,
         * so we can detect the chain change.
         */
        if (next_lock != waiter->lock)
                goto out_unlock_pi;

        /*
         * There could be 'spurious' loops in the lock graph due to ww_mutex,
         * consider:
         *
         *   P1: A, ww_A, ww_B
         *   P2: ww_B, ww_A
         *   P3: A
         *
         * P3 should not return -EDEADLK because it gets trapped in the cycle
         * created by P1 and P2 (which will resolve -- and runs into
         * max_lock_depth above). Therefore disable detect_deadlock such that
         * the below termination condition can trigger once all relevant tasks
         * are boosted.
         *
         * Even when we start with ww_mutex we can disable deadlock detection,
         * since we would suppress a ww_mutex induced deadlock at [6] anyway.
         * Suppressing it here however is not sufficient since we might still
         * hit [6] due to adjustment driven iteration.
         *
         * NOTE: if someone were to create a deadlock between 2 ww_classes we'd
         * utterly fail to report it; lockdep should.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
                detect_deadlock = false;

        /*
         * Drop out, when the task has no waiters. Note,
         * top_waiter can be NULL, when we are in the deboosting
         * mode!
         */
        if (top_waiter) {
                if (!task_has_pi_waiters(task))
                        goto out_unlock_pi;
                /*
                 * If deadlock detection is off, we stop here if we
                 * are not the top pi waiter of the task. If deadlock
                 * detection is enabled we continue, but stop the
                 * requeueing in the chain walk.
                 */
                if (top_waiter != task_top_pi_waiter(task)) {
                        if (!detect_deadlock)
                                goto out_unlock_pi;
                        else
                                requeue = false;
                }
        }

        /*
         * If the waiter priority is the same as the task priority
         * then there is no further priority adjustment necessary. If
         * deadlock detection is off, we stop the chain walk. If it's
         * enabled we continue, but stop the requeueing in the chain
         * walk.
         */
        if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
                if (!detect_deadlock)
                        goto out_unlock_pi;
                else
                        requeue = false;
        }

        /*
         * [4] Get the next lock; per holding task->pi_lock we can't unblock
         * and guarantee @lock's existence.
         */
        lock = waiter->lock;
        /*
         * [5] We need to trylock here as we are holding task->pi_lock,
         * which is the reverse lock order versus the other rtmutex
         * operations.
         *
         * Per the above, holding task->pi_lock guarantees lock exists, so
         * inverting this lock order is infeasible from a life-time
         * perspective.
         */
        if (!raw_spin_trylock(&lock->wait_lock)) {
                raw_spin_unlock_irq(&task->pi_lock);
                cpu_relax();
                goto retry;
        }

        /*
         * [6] check_exit_conditions_2() protected by task->pi_lock and
         * lock->wait_lock.
         *
         * Deadlock detection. If the lock is the same as the original
         * lock which caused us to walk the lock chain or if the
         * current lock is owned by the task which initiated the chain
         * walk, we detected a deadlock.
         */
        if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
                ret = -EDEADLK;

                /*
                 * When the deadlock is due to ww_mutex; also see above. Don't
                 * report the deadlock and instead let the ww_mutex wound/die
                 * logic pick which of the contending threads gets -EDEADLK.
                 *
                 * NOTE: assumes the cycle only contains a single ww_class; any
                 * other configuration and we fail to report; also, see
                 * lockdep.
                 */
                if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
                        ret = 0;

                raw_spin_unlock(&lock->wait_lock);
                goto out_unlock_pi;
        }

        /*
         * If we just follow the lock chain for deadlock detection, no
         * need to do all the requeue operations. To avoid a truckload
         * of conditionals around the various places below, just do the
         * minimum chain walk checks.
         */
        if (!requeue) {
                /*
                 * No requeue[7] here. Just release @task [8]
                 */
                raw_spin_unlock(&task->pi_lock);
                put_task_struct(task);

                /*
                 * [9] check_exit_conditions_3 protected by lock->wait_lock.
                 * If there is no owner of the lock, end of chain.
                 */
                if (!rt_mutex_owner(lock)) {
                        raw_spin_unlock_irq(&lock->wait_lock);
                        return 0;
                }

                /* [10] Grab the next task, i.e. owner of @lock */
                task = get_task_struct(rt_mutex_owner(lock));
                raw_spin_lock(&task->pi_lock);

                /*
                 * No requeue [11] here. We just do deadlock detection.
                 *
                 * [12] Store whether owner is blocked
                 * itself. Decision is made after dropping the locks
                 */
                next_lock = task_blocked_on_lock(task);
                /*
                 * Get the top waiter for the next iteration
                 */
                top_waiter = rt_mutex_top_waiter(lock);

                /* [13] Drop locks */
                raw_spin_unlock(&task->pi_lock);
                raw_spin_unlock_irq(&lock->wait_lock);

                /* If owner is not blocked, end of chain. */
                if (!next_lock)
                        goto out_put_task;
                goto again;
        }

        /*
         * Store the current top waiter before doing the requeue
         * operation on @lock. We need it for the boost/deboost
         * decision below.
         */
        prerequeue_top_waiter = rt_mutex_top_waiter(lock);

        /* [7] Requeue the waiter in the lock waiter tree. */
        rt_mutex_dequeue(lock, waiter);

        /*
         * Update the waiter prio fields now that we're dequeued.
         *
         * These values can have changed through either:
         *
         *   sys_sched_set_scheduler() / sys_sched_setattr()
         *
         * or
         *
         *   DL CBS enforcement advancing the effective deadline.
         */
        waiter_update_prio(waiter, task);

        rt_mutex_enqueue(lock, waiter);

        /*
         * [8] Release the (blocking) task in preparation for
         * taking the owner task in [10].
         *
         * Since we hold lock->wait_lock, task cannot unblock, even if we
         * release task->pi_lock.
         */
        raw_spin_unlock(&task->pi_lock);
        put_task_struct(task);

        /*
         * [9] check_exit_conditions_3 protected by lock->wait_lock.
         *
         * We must abort the chain walk if there is no lock owner even
         * in the deadlock detection case, as we have nothing to
         * follow here. This is the end of the chain we are walking.
         */
        if (!rt_mutex_owner(lock)) {
                /*
                 * If the requeue [7] above changed the top waiter,
                 * then we need to wake the new top waiter up to try
                 * to get the lock.
                 */
                top_waiter = rt_mutex_top_waiter(lock);
                if (prerequeue_top_waiter != top_waiter)
                        wake_up_state(top_waiter->task, top_waiter->wake_state);
                raw_spin_unlock_irq(&lock->wait_lock);
                return 0;
        }

        /*
         * [10] Grab the next task, i.e. the owner of @lock
         *
         * Per holding lock->wait_lock and checking for !owner above, there
         * must be an owner and it cannot go away.
         */
        task = get_task_struct(rt_mutex_owner(lock));
        raw_spin_lock(&task->pi_lock);

        /* [11] requeue the pi waiters if necessary */
        if (waiter == rt_mutex_top_waiter(lock)) {
                /*
                 * The waiter became the new top (highest priority)
                 * waiter on the lock. Replace the previous top waiter
                 * in the owner task's pi waiters tree with this waiter
                 * and adjust the priority of the owner.
                 */
                rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
                waiter_clone_prio(waiter, task);
                rt_mutex_enqueue_pi(task, waiter);
                rt_mutex_adjust_prio(lock, task);

        } else if (prerequeue_top_waiter == waiter) {
                /*
                 * The waiter was the top waiter on the lock, but is
                 * no longer the top priority waiter. Replace waiter in
                 * the owner task's pi waiters tree with the new top
                 * (highest priority) waiter and adjust the priority
                 * of the owner.
                 * The new top waiter is stored in @waiter so that
                 * @waiter == @top_waiter evaluates to true below and
                 * we continue to deboost the rest of the chain.
                 */
                rt_mutex_dequeue_pi(task, waiter);
                waiter = rt_mutex_top_waiter(lock);
                waiter_clone_prio(waiter, task);
                rt_mutex_enqueue_pi(task, waiter);
                rt_mutex_adjust_prio(lock, task);
        } else {
                /*
                 * Nothing changed. No need to do any priority
                 * adjustment.
                 */
        }

        /*
         * [12] check_exit_conditions_4() protected by task->pi_lock
         * and lock->wait_lock. The actual decisions are made after we
         * dropped the locks.
         *
         * Check whether the task which owns the current lock is pi
         * blocked itself. If yes we store a pointer to the lock for
         * the lock chain change detection above. After we dropped
         * task->pi_lock next_lock cannot be dereferenced anymore.
         */
        next_lock = task_blocked_on_lock(task);
        /*
         * Store the top waiter of @lock for the end of chain walk
         * decision below.
         */
        top_waiter = rt_mutex_top_waiter(lock);

        /* [13] Drop the locks */
        raw_spin_unlock(&task->pi_lock);
        raw_spin_unlock_irq(&lock->wait_lock);

        /*
         * Make the actual exit decisions [12], based on the stored
         * values.
         *
         * We reached the end of the lock chain. Stop right here. No
         * point to go back just to figure that out.
         */
        if (!next_lock)
                goto out_put_task;

        /*
         * If the current waiter is not the top waiter on the lock,
         * then we can stop the chain walk here if we are not in full
         * deadlock detection mode.
         */
        if (!detect_deadlock && waiter != top_waiter)
                goto out_put_task;

        goto again;

 out_unlock_pi:
        raw_spin_unlock_irq(&task->pi_lock);
 out_put_task:
        put_task_struct(task);

        return ret;
}

/*
 * Try to take an rt-mutex
 *
 * Must be called with lock->wait_lock held and interrupts disabled
 *
 * @lock:   The lock to be acquired.
 * @task:   The task which wants to acquire the lock
 * @waiter: The waiter that is queued to the lock's wait tree if the
 *          callsite called task_blocked_on_lock(), otherwise NULL
 */
static int __sched
try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
                     struct rt_mutex_waiter *waiter)
{
        lockdep_assert_held(&lock->wait_lock);

        /*
         * Before testing whether we can acquire @lock, we set the
         * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
         * other tasks which try to modify @lock into the slow path
         * and they serialize on @lock->wait_lock.
         *
         * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
         * as explained at the top of this file if and only if:
         *
         * - There is a lock owner. The caller must fixup the
         *   transient state if it does a trylock or leaves the lock
         *   function due to a signal or timeout.
         *
         * - @task acquires the lock and there are no other
         *   waiters. This is undone in rt_mutex_set_owner(@task) at
         *   the end of this function.
         */
        mark_rt_mutex_waiters(lock);

        /*
         * If @lock has an owner, give up.
         */
        if (rt_mutex_owner(lock))
                return 0;

        /*
         * If @waiter != NULL, @task has already enqueued the waiter
         * into @lock waiter tree. If @waiter == NULL then this is a
         * trylock attempt.
         */
        if (waiter) {
                struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);

                /*
                 * If waiter is the highest priority waiter of @lock,
                 * or allowed to steal it, take it over.
                 */
                if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
                        /*
                         * We can acquire the lock. Remove the waiter from the
                         * lock waiters tree.
                         */
                        rt_mutex_dequeue(lock, waiter);
                } else {
                        return 0;
                }
        } else {
                /*
                 * If the lock has waiters already we check whether @task is
                 * eligible to take over the lock.
                 *
                 * If there are no other waiters, @task can acquire
                 * the lock. @task->pi_blocked_on is NULL, so it does
                 * not need to be dequeued.
                 */
                if (rt_mutex_has_waiters(lock)) {
                        /* Check whether the trylock can steal it. */
                        if (!rt_mutex_steal(task_to_waiter(task),
                                            rt_mutex_top_waiter(lock)))
                                return 0;

                        /*
                         * The current top waiter stays enqueued. We
                         * don't have to change anything in the lock
                         * waiters order.
                         */
                } else {
                        /*
                         * No waiters. Take the lock without the
                         * pi_lock dance. @task->pi_blocked_on is NULL
                         * and we have no waiters to enqueue in @task
                         * pi waiters tree.
                         */
                        goto takeit;
                }
        }

        /*
         * Clear @task->pi_blocked_on. Requires protection by
         * @task->pi_lock. Redundant operation for the @waiter == NULL
         * case, but conditionals are more expensive than a redundant
         * store.
         */
        raw_spin_lock(&task->pi_lock);
        task->pi_blocked_on = NULL;
        /*
         * Finish the lock acquisition. @task is the new owner. If
         * other waiters exist we have to insert the highest priority
         * waiter into @task->pi_waiters tree.
         */
        if (rt_mutex_has_waiters(lock))
                rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
        raw_spin_unlock(&task->pi_lock);

 takeit:
        /*
         * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
         * are still waiters or clears it.
         */
        rt_mutex_set_owner(lock, task);

        return 1;
}

/*
 * Task blocks on lock.
 *
 * Prepare waiter and propagate pi chain
 *
 * This must be called with lock->wait_lock held and interrupts disabled
 */
static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
                                           struct rt_mutex_waiter *waiter,
                                           struct task_struct *task,
                                           struct ww_acquire_ctx *ww_ctx,
                                           enum rtmutex_chainwalk chwalk,
                                           struct wake_q_head *wake_q)
{
        struct task_struct *owner = rt_mutex_owner(lock);
        struct rt_mutex_waiter *top_waiter = waiter;
        struct rt_mutex_base *next_lock;
        int chain_walk = 0, res;

        lockdep_assert_held(&lock->wait_lock);

        /*
         * Early deadlock detection. We really don't want the task to
         * enqueue on itself just to untangle the mess later. It's not
         * only an optimization. We drop the locks, so another waiter
         * can come in before the chain walk detects the deadlock. So
         * the other will detect the deadlock and return -EDEADLOCK,
         * which is wrong, as the other waiter is not in a deadlock
         * situation.
         *
         * Except for ww_mutex, in that case the chain walk must already deal
         * with spurious cycles, see the comments at [3] and [6].
         */
        if (owner == task && !(build_ww_mutex() && ww_ctx))
                return -EDEADLK;

        raw_spin_lock(&task->pi_lock);
        waiter->task = task;
        waiter->lock = lock;
        waiter_update_prio(waiter, task);
        waiter_clone_prio(waiter, task);

        /* Get the top priority waiter on the lock */
        if (rt_mutex_has_waiters(lock))
                top_waiter = rt_mutex_top_waiter(lock);
        rt_mutex_enqueue(lock, waiter);

        task->pi_blocked_on = waiter;

        raw_spin_unlock(&task->pi_lock);

        if (build_ww_mutex() && ww_ctx) {
                struct rt_mutex *rtm;

                /* Check whether the waiter should back out immediately */
                rtm = container_of(lock, struct rt_mutex, rtmutex);
                preempt_disable();
                res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q);
                wake_up_q(wake_q);
                preempt_enable();
                if (res) {
                        raw_spin_lock(&task->pi_lock);
                        rt_mutex_dequeue(lock, waiter);
                        task->pi_blocked_on = NULL;
                        raw_spin_unlock(&task->pi_lock);
                        return res;
                }
        }

        if (!owner)
                return 0;

        raw_spin_lock(&owner->pi_lock);
        if (waiter == rt_mutex_top_waiter(lock)) {
                rt_mutex_dequeue_pi(owner, top_waiter);
                rt_mutex_enqueue_pi(owner, waiter);

                rt_mutex_adjust_prio(lock, owner);
                if (owner->pi_blocked_on)
                        chain_walk = 1;
        } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
                chain_walk = 1;
        }

        /* Store the lock on which owner is blocked or NULL */
        next_lock = task_blocked_on_lock(owner);

        raw_spin_unlock(&owner->pi_lock);
        /*
         * Even if full deadlock detection is on, if the owner is not
         * blocked itself, we can avoid finding this out in the chain
         * walk.
         */
        if (!chain_walk || !next_lock)
                return 0;

        /*
         * The owner can't disappear while holding a lock,
         * so the owner struct is protected by wait_lock.
         * Gets dropped in rt_mutex_adjust_prio_chain()!
         */
        get_task_struct(owner);

        raw_spin_unlock_irq(&lock->wait_lock);

        res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
                                         next_lock, waiter, task);

        raw_spin_lock_irq(&lock->wait_lock);

        return res;
}

/*
 * Remove the top waiter from the current task's pi waiter tree and
 * queue it up.
 *
 * Called with lock->wait_lock held and interrupts disabled.
 */
static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
                                            struct rt_mutex_base *lock)
{
        struct rt_mutex_waiter *waiter;

        lockdep_assert_held(&lock->wait_lock);

        raw_spin_lock(&current->pi_lock);

        waiter = rt_mutex_top_waiter(lock);

        /*
         * Remove it from current->pi_waiters and deboost.
         *
         * We must in fact deboost here in order to ensure we call
         * rt_mutex_setprio() to update p->pi_top_task before the
         * task unblocks.
         */
        rt_mutex_dequeue_pi(current, waiter);
        rt_mutex_adjust_prio(lock, current);

        /*
         * As we are waking up the top waiter, and the waiter stays
         * queued on the lock until it gets the lock, this lock
         * obviously has waiters. Just set the bit here and this has
         * the added benefit of forcing all new tasks into the
         * slow path making sure no task of lower priority than
         * the top waiter can steal this lock.
         */
        lock->owner = (void *) RT_MUTEX_HAS_WAITERS;

        /*
         * We deboosted before waking the top waiter task such that we don't
         * run two tasks with the 'same' priority (and ensure the
         * p->pi_top_task pointer points to a blocked task). This however can
         * lead to priority inversion if we would get preempted after the
         * deboost but before waking our donor task, hence the preempt_disable()
         * before unlock.
         *
         * Pairs with preempt_enable() in rt_mutex_wake_up_q();
         */
        preempt_disable();
        rt_mutex_wake_q_add(wqh, waiter);
        raw_spin_unlock(&current->pi_lock);
}

static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
        int ret = try_to_take_rt_mutex(lock, current, NULL);

        /*
         * try_to_take_rt_mutex() sets the lock waiters bit
         * unconditionally. Clean this up.
         */
        fixup_rt_mutex_waiters(lock, true);

        return ret;
}

/*
 * Slow path try-lock function:
 */
static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
        unsigned long flags;
        int ret;

        /*
         * If the lock already has an owner we fail to get the lock.
         * This can be done without taking the @lock->wait_lock as
         * it is only being read, and this is a trylock anyway.
         */
        if (rt_mutex_owner(lock))
                return 0;

        /*
         * The mutex has currently no owner. Lock the wait lock and try to
         * acquire the lock. We use irqsave here to support early boot calls.
         */
        raw_spin_lock_irqsave(&lock->wait_lock, flags);

        ret = __rt_mutex_slowtrylock(lock);

        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

        return ret;
}

static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock)
{
        if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
                return 1;

        return rt_mutex_slowtrylock(lock);
}

/*
 * Slow path to release a rt-mutex.
 */
static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
{
        DEFINE_RT_WAKE_Q(wqh);
        unsigned long flags;

        /* irqsave required to support early boot calls */
        raw_spin_lock_irqsave(&lock->wait_lock, flags);

        debug_rt_mutex_unlock(lock);

        /*
         * We must be careful here if the fast path is enabled. If we
         * have no waiters queued we cannot set owner to NULL here
         * because of:
         *
         * foo->lock->owner = NULL;
         *                      rtmutex_lock(foo->lock); <- fast path
         *                      free = atomic_dec_and_test(foo->refcnt);
         *                      rtmutex_unlock(foo->lock); <- fast path
         *                      if (free)
         *                              kfree(foo);
         * raw_spin_unlock(foo->lock->wait_lock);
         *
         * So for the fastpath enabled kernel:
         *
         * Nothing can set the waiters bit as long as we hold
         * lock->wait_lock. So we do the following sequence:
         *
         *      owner = rt_mutex_owner(lock);
         *      clear_rt_mutex_waiters(lock);
         *      raw_spin_unlock(&lock->wait_lock);
         *      if (cmpxchg(&lock->owner, owner, 0) == owner)
         *              return;
         *      goto retry;
         *
         * The fastpath disabled variant is simple as all access to
         * lock->owner is serialized by lock->wait_lock:
         *
         *      lock->owner = NULL;
         *      raw_spin_unlock(&lock->wait_lock);
         */
        while (!rt_mutex_has_waiters(lock)) {
                /* Drops lock->wait_lock ! */
                if (unlock_rt_mutex_safe(lock, flags) == true)
                        return;
                /* Relock the rtmutex and try again */
                raw_spin_lock_irqsave(&lock->wait_lock, flags);
        }

        /*
         * The wakeup next waiter path does not suffer from the above
         * race. See the comments there.
         *
         * Queue the next waiter for wakeup once we release the wait_lock.
         */
        mark_wakeup_next_waiter(&wqh, lock);
        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

        rt_mutex_wake_up_q(&wqh);
}

static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock)
{
        if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
                return;

        rt_mutex_slowunlock(lock);
}

#ifdef CONFIG_SMP
static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
                                  struct rt_mutex_waiter *waiter,
                                  struct task_struct *owner)
{
        bool res = true;

        rcu_read_lock();
        for (;;) {
                /* If owner changed, trylock again. */
                if (owner != rt_mutex_owner(lock))
                        break;
                /*
                 * Ensure that @owner is dereferenced after checking that
                 * the lock owner still matches @owner. If that fails,
                 * @owner might point to freed memory. If it still matches,
                 * the rcu_read_lock() ensures the memory stays valid.
                 */
                barrier();
                /*
                 * Stop spinning when:
                 *  - the lock owner has been scheduled out
                 *  - current is no longer the top waiter
                 *  - current is requested to reschedule (redundant
                 *    for CONFIG_PREEMPT_RCU=y)
                 *  - the VCPU on which owner runs is preempted
                 */
                if (!owner_on_cpu(owner) || need_resched() ||
                    !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
                        res = false;
                        break;
                }
                cpu_relax();
        }
        rcu_read_unlock();
        return res;
}
#else
static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
                                  struct rt_mutex_waiter *waiter,
                                  struct task_struct *owner)
{
        return false;
}
#endif

#ifdef RT_MUTEX_BUILD_MUTEX
/*
 * Functions required for:
 *  - rtmutex, futex on all kernels
 *  - mutex and rwsem substitutions on RT kernels
 */

/*
 * Remove a waiter from a lock and give up
 *
 * Must be called with lock->wait_lock held and interrupts disabled. It must
 * have just failed to try_to_take_rt_mutex().
 */
static void __sched remove_waiter(struct rt_mutex_base *lock,
                                  struct rt_mutex_waiter *waiter)
{
        bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
        struct task_struct *owner = rt_mutex_owner(lock);
        struct rt_mutex_base *next_lock;

        lockdep_assert_held(&lock->wait_lock);

        raw_spin_lock(&current->pi_lock);
        rt_mutex_dequeue(lock, waiter);
        current->pi_blocked_on = NULL;
        raw_spin_unlock(&current->pi_lock);

        /*
         * Only update priority if the waiter was the highest priority
         * waiter of the lock and there is an owner to update.
         */
        if (!owner || !is_top_waiter)
                return;

        raw_spin_lock(&owner->pi_lock);

        rt_mutex_dequeue_pi(owner, waiter);

        if (rt_mutex_has_waiters(lock))
                rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));

        rt_mutex_adjust_prio(lock, owner);

        /* Store the lock on which owner is blocked or NULL */
        next_lock = task_blocked_on_lock(owner);

        raw_spin_unlock(&owner->pi_lock);

        /*
         * Don't walk the chain, if the owner task is not blocked
         * itself.
         */
        if (!next_lock)
                return;

        /* gets dropped in rt_mutex_adjust_prio_chain()! */
        get_task_struct(owner);

        raw_spin_unlock_irq(&lock->wait_lock);

        rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
                                   next_lock, NULL, current);

        raw_spin_lock_irq(&lock->wait_lock);
}

/**
 * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop
 * @lock:    the rt_mutex to take
 * @ww_ctx:  WW mutex context pointer
 * @state:   the state the task should block in (TASK_INTERRUPTIBLE
 *           or TASK_UNINTERRUPTIBLE)
 * @timeout: the pre-initialized and started timer, or NULL for none
 * @waiter:  the pre-initialized rt_mutex_waiter
 *
 * Must be called with lock->wait_lock held and interrupts disabled
 */
static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
                                           struct ww_acquire_ctx *ww_ctx,
                                           unsigned int state,
                                           struct hrtimer_sleeper *timeout,
                                           struct rt_mutex_waiter *waiter)
        __releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
        struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
        struct task_struct *owner;
        int ret = 0;

        for (;;) {
                /* Try to acquire the lock: */
                if (try_to_take_rt_mutex(lock, current, waiter))
                        break;

                if (timeout && !timeout->task) {
                        ret = -ETIMEDOUT;
                        break;
                }
                if (signal_pending_state(state, current)) {
                        ret = -EINTR;
                        break;
                }

                if (build_ww_mutex() && ww_ctx) {
                        ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx);
                        if (ret)
                                break;
                }

                if (waiter == rt_mutex_top_waiter(lock))
                        owner = rt_mutex_owner(lock);
                else
                        owner = NULL;
                raw_spin_unlock_irq(&lock->wait_lock);

                if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
                        rt_mutex_schedule();

                raw_spin_lock_irq(&lock->wait_lock);
                set_current_state(state);
        }

        __set_current_state(TASK_RUNNING);
        return ret;
}

static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
                                             struct rt_mutex_base *lock,
                                             struct rt_mutex_waiter *w)
{
        /*
         * If the result is not -EDEADLOCK or the caller requested
         * deadlock detection, nothing to do here.
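         *
         * Otherwise a genuine deadlock was hit by a caller that did not ask
         * for deadlock detection (and typically has no way to return the
         * error): complain once and park the task below; the loop re-enters
         * schedule() even if a signal wakes it.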
         */
        if (res != -EDEADLOCK || detect_deadlock)
                return;

        if (build_ww_mutex() && w->ww_ctx)
                return;

        raw_spin_unlock_irq(&lock->wait_lock);

        WARN(1, "rtmutex deadlock detected\n");

        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);
                rt_mutex_schedule();
        }
}

/**
 * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held
 * @lock:   The rtmutex to block on
 * @ww_ctx: WW mutex context pointer
 * @state:  The task state for sleeping
 * @chwalk: Indicator whether full or partial chainwalk is requested
 * @waiter: Initialized waiter for blocking
 * @wake_q: The wake_q to wake tasks after we release the wait_lock
 */
static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
                                       struct ww_acquire_ctx *ww_ctx,
                                       unsigned int state,
                                       enum rtmutex_chainwalk chwalk,
                                       struct rt_mutex_waiter *waiter,
                                       struct wake_q_head *wake_q)
{
        struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
        struct ww_mutex *ww = ww_container_of(rtm);
        int ret;

        lockdep_assert_held(&lock->wait_lock);

        /* Try to acquire the lock again: */
        if (try_to_take_rt_mutex(lock, current, NULL)) {
                if (build_ww_mutex() && ww_ctx) {
                        __ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
                        ww_mutex_lock_acquired(ww, ww_ctx);
                }
                return 0;
        }

        set_current_state(state);

        trace_contention_begin(lock, LCB_F_RT);

        ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q);
        if (likely(!ret))
                ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);

        if (likely(!ret)) {
                /* acquired the lock */
                if (build_ww_mutex() && ww_ctx) {
                        if (!ww_ctx->is_wait_die)
                                __ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
                        ww_mutex_lock_acquired(ww, ww_ctx);
                }
        } else {
                __set_current_state(TASK_RUNNING);
                remove_waiter(lock, waiter);
                rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
        }

        /*
         * try_to_take_rt_mutex() sets the waiter bit
         * unconditionally. We might have to fix that up.
         */
        fixup_rt_mutex_waiters(lock, true);

        trace_contention_end(lock, ret);

        return ret;
}

static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
                                             struct ww_acquire_ctx *ww_ctx,
                                             unsigned int state,
                                             struct wake_q_head *wake_q)
{
        struct rt_mutex_waiter waiter;
        int ret;

        rt_mutex_init_waiter(&waiter);
        waiter.ww_ctx = ww_ctx;

        ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
                                  &waiter, wake_q);

        debug_rt_mutex_free_waiter(&waiter);
        return ret;
}

/*
 * rt_mutex_slowlock - Locking slowpath invoked when fast path fails
 * @lock:   The rtmutex to block on
 * @ww_ctx: WW mutex context pointer
 * @state:  The task state for sleeping
 */
static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
                                     struct ww_acquire_ctx *ww_ctx,
                                     unsigned int state)
{
        DEFINE_WAKE_Q(wake_q);
        unsigned long flags;
        int ret;

        /*
         * Do all pre-schedule work here, before we queue a waiter and invoke
         * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
         * otherwise recurse back into task_blocks_on_rt_mutex() through
         * rtlock_slowlock() and will then enqueue a second waiter for this
         * same task and things get really confusing real fast.
         */
        rt_mutex_pre_schedule();

        /*
         * Technically we could use raw_spin_[un]lock_irq() here, but this can
         * be called in early boot if the cmpxchg() fast path is disabled
         * (debug, no architecture support). In this case we will acquire the
         * rtmutex with lock->wait_lock held. But we cannot unconditionally
         * enable interrupts in that early boot case. So we need to use the
         * irqsave/restore variants.
         */
        raw_spin_lock_irqsave(&lock->wait_lock, flags);
        ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q);
        preempt_disable();
        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
        wake_up_q(&wake_q);
        preempt_enable();
        rt_mutex_post_schedule();

        return ret;
}

static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
                                           unsigned int state)
{
        lockdep_assert(!current->pi_blocked_on);

        if (likely(rt_mutex_try_acquire(lock)))
                return 0;

        return rt_mutex_slowlock(lock, NULL, state);
}
#endif /* RT_MUTEX_BUILD_MUTEX */

#ifdef RT_MUTEX_BUILD_SPINLOCKS
/*
 * Functions required for spin/rw_lock substitution on RT kernels
 */

/**
 * rtlock_slowlock_locked - Slow path lock acquisition for RT locks
 * @lock:   The underlying RT mutex
 * @wake_q: The wake_q to wake tasks after we release the wait_lock
 */
static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
                                           struct wake_q_head *wake_q)
        __releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
        struct rt_mutex_waiter waiter;
        struct task_struct *owner;

        lockdep_assert_held(&lock->wait_lock);

        if (try_to_take_rt_mutex(lock, current, NULL))
                return;

        rt_mutex_init_rtlock_waiter(&waiter);

        /* Save current state and set state to TASK_RTLOCK_WAIT */
        current_save_and_set_rtlock_wait_state();

        trace_contention_begin(lock, LCB_F_RT);

        task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK, wake_q);

        for (;;) {
                /* Try to acquire the lock again */
                if (try_to_take_rt_mutex(lock, current, &waiter))
                        break;

                if (&waiter == rt_mutex_top_waiter(lock))
                        owner = rt_mutex_owner(lock);
                else
                        owner = NULL;
                preempt_disable();
                raw_spin_unlock_irq(&lock->wait_lock);
                wake_up_q(wake_q);
                wake_q_init(wake_q);
                preempt_enable();

                if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
                        schedule_rtlock();

                raw_spin_lock_irq(&lock->wait_lock);
                set_current_state(TASK_RTLOCK_WAIT);
        }

        /* Restore the task state */
        current_restore_rtlock_saved_state();

        /*
         * try_to_take_rt_mutex() sets the waiter bit unconditionally.
         * We might have to fix that up:
         */
        fixup_rt_mutex_waiters(lock, true);
        debug_rt_mutex_free_waiter(&waiter);

        trace_contention_end(lock, 0);
}

static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
{
        unsigned long flags;
        DEFINE_WAKE_Q(wake_q);

        raw_spin_lock_irqsave(&lock->wait_lock, flags);
        rtlock_slowlock_locked(lock, &wake_q);
        preempt_disable();
        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
        wake_up_q(&wake_q);
        preempt_enable();
}

#endif /* RT_MUTEX_BUILD_SPINLOCKS */
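
/*
 * Usage sketch (illustration only; the example_* wrappers below are not
 * real kernel API): the fast path / slow path split above is what the
 * public wrappers in rtmutex_api.c build on. Assuming the usual layout
 * with an rt_mutex_base embedded as 'rtmutex', a minimal sleeping-lock
 * wrapper would look like:
 *
 *      void example_rt_mutex_lock(struct rt_mutex *lock)
 *      {
 *              // One cmpxchg_acquire() when uncontended, PI-aware slow
 *              // path with priority chain walk otherwise.
 *              __rt_mutex_lock(&lock->rtmutex, TASK_UNINTERRUPTIBLE);
 *      }
 *
 *      void example_rt_mutex_unlock(struct rt_mutex *lock)
 *      {
 *              // One cmpxchg_release() when no waiters are queued,
 *              // rt_mutex_slowunlock() plus a wakeup otherwise.
 *              __rt_mutex_unlock(&lock->rtmutex);
 *      }
 *
 * The real wrappers additionally handle lockdep annotations and the debug
 * variants.
 */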