1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * RT-Mutexes: simple blocking mutual exclusion locks with PI support 4 * 5 * started by Ingo Molnar and Thomas Gleixner. 6 * 7 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 8 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 9 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt 10 * Copyright (C) 2006 Esben Nielsen 11 * Adaptive Spinlocks: 12 * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, 13 * and Peter Morreale, 14 * Adaptive Spinlocks simplification: 15 * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> 16 * 17 * See Documentation/locking/rt-mutex-design.rst for details. 18 */ 19 #include <linux/sched.h> 20 #include <linux/sched/debug.h> 21 #include <linux/sched/deadline.h> 22 #include <linux/sched/signal.h> 23 #include <linux/sched/rt.h> 24 #include <linux/sched/wake_q.h> 25 #include <linux/ww_mutex.h> 26 27 #include <trace/events/lock.h> 28 29 #include "rtmutex_common.h" 30 #include "lock_events.h" 31 32 #ifndef WW_RT 33 # define build_ww_mutex() (false) 34 # define ww_container_of(rtm) NULL 35 36 static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter, 37 struct rt_mutex *lock, 38 struct ww_acquire_ctx *ww_ctx, 39 struct wake_q_head *wake_q) 40 { 41 return 0; 42 } 43 44 static inline void __ww_mutex_check_waiters(struct rt_mutex *lock, 45 struct ww_acquire_ctx *ww_ctx, 46 struct wake_q_head *wake_q) 47 { 48 } 49 50 static inline void ww_mutex_lock_acquired(struct ww_mutex *lock, 51 struct ww_acquire_ctx *ww_ctx) 52 { 53 } 54 55 static inline int __ww_mutex_check_kill(struct rt_mutex *lock, 56 struct rt_mutex_waiter *waiter, 57 struct ww_acquire_ctx *ww_ctx) 58 { 59 return 0; 60 } 61 62 #else 63 # define build_ww_mutex() (true) 64 # define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base) 65 # include "ww_mutex.h" 66 #endif 67 68 /* 69 * lock->owner state tracking: 70 * 71 * lock->owner holds the task_struct pointer of the owner. Bit 0 72 * is used to keep track of the "lock has waiters" state. 73 * 74 * owner bit0 75 * NULL 0 lock is free (fast acquire possible) 76 * NULL 1 lock is free and has waiters and the top waiter 77 * is going to take the lock* 78 * taskpointer 0 lock is held (fast release possible) 79 * taskpointer 1 lock is held and has waiters** 80 * 81 * The fast atomic compare exchange based acquire and release is only 82 * possible when bit 0 of lock->owner is 0. 83 * 84 * (*) It also can be a transitional state when grabbing the lock 85 * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, 86 * we need to set the bit0 before looking at the lock, and the owner may be 87 * NULL in this small time, hence this can be a transitional state. 88 * 89 * (**) There is a small time when bit 0 is set but there are no 90 * waiters. This can happen when grabbing the lock in the slow path. 91 * To prevent a cmpxchg of the owner releasing the lock, we need to 92 * set this bit before looking at the lock. 93 */ 94 95 static __always_inline struct task_struct * 96 rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) 97 __must_hold(&lock->wait_lock) 98 { 99 unsigned long val = (unsigned long)owner; 100 101 if (rt_mutex_has_waiters(lock)) 102 val |= RT_MUTEX_HAS_WAITERS; 103 104 return (struct task_struct *)val; 105 } 106 107 static __always_inline void 108 rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) 109 __must_hold(&lock->wait_lock) 110 { 111 /* 112 * lock->wait_lock is held but explicit acquire semantics are needed 113 * for a new lock owner so WRITE_ONCE is insufficient. 114 */ 115 xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner)); 116 } 117 118 static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock) 119 __must_hold(&lock->wait_lock) 120 { 121 /* lock->wait_lock is held so the unlock provides release semantics. */ 122 WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL)); 123 } 124 125 static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) 126 __must_hold(&lock->wait_lock) 127 { 128 lock->owner = (struct task_struct *) 129 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); 130 } 131 132 static __always_inline void 133 fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock) 134 __must_hold(&lock->wait_lock) 135 { 136 unsigned long owner, *p = (unsigned long *) &lock->owner; 137 138 if (rt_mutex_has_waiters(lock)) 139 return; 140 141 /* 142 * The rbtree has no waiters enqueued, now make sure that the 143 * lock->owner still has the waiters bit set, otherwise the 144 * following can happen: 145 * 146 * CPU 0 CPU 1 CPU2 147 * l->owner=T1 148 * rt_mutex_lock(l) 149 * lock(l->lock) 150 * l->owner = T1 | HAS_WAITERS; 151 * enqueue(T2) 152 * boost() 153 * unlock(l->lock) 154 * block() 155 * 156 * rt_mutex_lock(l) 157 * lock(l->lock) 158 * l->owner = T1 | HAS_WAITERS; 159 * enqueue(T3) 160 * boost() 161 * unlock(l->lock) 162 * block() 163 * signal(->T2) signal(->T3) 164 * lock(l->lock) 165 * dequeue(T2) 166 * deboost() 167 * unlock(l->lock) 168 * lock(l->lock) 169 * dequeue(T3) 170 * ==> wait list is empty 171 * deboost() 172 * unlock(l->lock) 173 * lock(l->lock) 174 * fixup_rt_mutex_waiters() 175 * if (wait_list_empty(l) { 176 * l->owner = owner 177 * owner = l->owner & ~HAS_WAITERS; 178 * ==> l->owner = T1 179 * } 180 * lock(l->lock) 181 * rt_mutex_unlock(l) fixup_rt_mutex_waiters() 182 * if (wait_list_empty(l) { 183 * owner = l->owner & ~HAS_WAITERS; 184 * cmpxchg(l->owner, T1, NULL) 185 * ===> Success (l->owner = NULL) 186 * 187 * l->owner = owner 188 * ==> l->owner = T1 189 * } 190 * 191 * With the check for the waiter bit in place T3 on CPU2 will not 192 * overwrite. All tasks fiddling with the waiters bit are 193 * serialized by l->lock, so nothing else can modify the waiters 194 * bit. If the bit is set then nothing can change l->owner either 195 * so the simple RMW is safe. The cmpxchg() will simply fail if it 196 * happens in the middle of the RMW because the waiters bit is 197 * still set. 198 */ 199 owner = READ_ONCE(*p); 200 if (owner & RT_MUTEX_HAS_WAITERS) { 201 /* 202 * See rt_mutex_set_owner() and rt_mutex_clear_owner() on 203 * why xchg_acquire() is used for updating owner for 204 * locking and WRITE_ONCE() for unlocking. 205 * 206 * WRITE_ONCE() would work for the acquire case too, but 207 * in case that the lock acquisition failed it might 208 * force other lockers into the slow path unnecessarily. 209 */ 210 if (acquire_lock) 211 xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS); 212 else 213 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); 214 } 215 } 216 217 /* 218 * We can speed up the acquire/release, if there's no debugging state to be 219 * set up. 220 */ 221 #ifndef CONFIG_DEBUG_RT_MUTEXES 222 static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, 223 struct task_struct *old, 224 struct task_struct *new) 225 { 226 return try_cmpxchg_acquire(&lock->owner, &old, new); 227 } 228 229 static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) 230 { 231 return rt_mutex_cmpxchg_acquire(lock, NULL, current); 232 } 233 234 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, 235 struct task_struct *old, 236 struct task_struct *new) 237 { 238 return try_cmpxchg_release(&lock->owner, &old, new); 239 } 240 241 /* 242 * Callers must hold the ->wait_lock -- which is the whole purpose as we force 243 * all future threads that attempt to [Rmw] the lock to the slowpath. As such 244 * relaxed semantics suffice. 245 */ 246 static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) 247 { 248 unsigned long *p = (unsigned long *) &lock->owner; 249 unsigned long owner, new; 250 251 owner = READ_ONCE(*p); 252 do { 253 new = owner | RT_MUTEX_HAS_WAITERS; 254 } while (!try_cmpxchg_relaxed(p, &owner, new)); 255 256 /* 257 * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE 258 * operations in the event of contention. Ensure the successful 259 * cmpxchg is visible. 260 */ 261 smp_mb__after_atomic(); 262 } 263 264 /* 265 * Safe fastpath aware unlock: 266 * 1) Clear the waiters bit 267 * 2) Drop lock->wait_lock 268 * 3) Try to unlock the lock with cmpxchg 269 */ 270 static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, 271 unsigned long flags) 272 __releases(lock->wait_lock) 273 { 274 struct task_struct *owner = rt_mutex_owner(lock); 275 276 clear_rt_mutex_waiters(lock); 277 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 278 /* 279 * If a new waiter comes in between the unlock and the cmpxchg 280 * we have two situations: 281 * 282 * unlock(wait_lock); 283 * lock(wait_lock); 284 * cmpxchg(p, owner, 0) == owner 285 * mark_rt_mutex_waiters(lock); 286 * acquire(lock); 287 * or: 288 * 289 * unlock(wait_lock); 290 * lock(wait_lock); 291 * mark_rt_mutex_waiters(lock); 292 * 293 * cmpxchg(p, owner, 0) != owner 294 * enqueue_waiter(); 295 * unlock(wait_lock); 296 * lock(wait_lock); 297 * wake waiter(); 298 * unlock(wait_lock); 299 * lock(wait_lock); 300 * acquire(lock); 301 */ 302 return rt_mutex_cmpxchg_release(lock, owner, NULL); 303 } 304 305 #else 306 static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, 307 struct task_struct *old, 308 struct task_struct *new) 309 { 310 return false; 311 312 } 313 314 static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock); 315 316 static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) 317 { 318 /* 319 * With debug enabled rt_mutex_cmpxchg trylock() will always fail. 320 * 321 * Avoid unconditionally taking the slow path by using 322 * rt_mutex_slow_trylock() which is covered by the debug code and can 323 * acquire a non-contended rtmutex. 324 */ 325 return rt_mutex_slowtrylock(lock); 326 } 327 328 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, 329 struct task_struct *old, 330 struct task_struct *new) 331 { 332 return false; 333 } 334 335 static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) 336 __must_hold(&lock->wait_lock) 337 { 338 lock->owner = (struct task_struct *) 339 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); 340 } 341 342 /* 343 * Simple slow path only version: lock->owner is protected by lock->wait_lock. 344 */ 345 static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, 346 unsigned long flags) 347 __releases(lock->wait_lock) 348 { 349 lock->owner = NULL; 350 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 351 return true; 352 } 353 #endif 354 355 static __always_inline int __waiter_prio(struct task_struct *task) 356 { 357 int prio = task->prio; 358 359 if (!rt_or_dl_prio(prio)) 360 return DEFAULT_PRIO; 361 362 return prio; 363 } 364 365 /* 366 * Update the waiter->tree copy of the sort keys. 367 */ 368 static __always_inline void 369 waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) 370 { 371 lockdep_assert_held(&waiter->lock->wait_lock); 372 lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry)); 373 374 waiter->tree.prio = __waiter_prio(task); 375 waiter->tree.deadline = task->dl.deadline; 376 } 377 378 /* 379 * Update the waiter->pi_tree copy of the sort keys (from the tree copy). 380 */ 381 static __always_inline void 382 waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) 383 { 384 lockdep_assert_held(&waiter->lock->wait_lock); 385 lockdep_assert_held(&task->pi_lock); 386 lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry)); 387 388 waiter->pi_tree.prio = waiter->tree.prio; 389 waiter->pi_tree.deadline = waiter->tree.deadline; 390 } 391 392 /* 393 * Only use with rt_waiter_node_{less,equal}() 394 */ 395 #define task_to_waiter_node(p) \ 396 &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } 397 #define task_to_waiter(p) \ 398 &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) } 399 400 static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left, 401 struct rt_waiter_node *right) 402 { 403 if (left->prio < right->prio) 404 return 1; 405 406 /* 407 * If both waiters have dl_prio(), we check the deadlines of the 408 * associated tasks. 409 * If left waiter has a dl_prio(), and we didn't return 1 above, 410 * then right waiter has a dl_prio() too. 411 */ 412 if (dl_prio(left->prio)) 413 return dl_time_before(left->deadline, right->deadline); 414 415 return 0; 416 } 417 418 static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left, 419 struct rt_waiter_node *right) 420 { 421 if (left->prio != right->prio) 422 return 0; 423 424 /* 425 * If both waiters have dl_prio(), we check the deadlines of the 426 * associated tasks. 427 * If left waiter has a dl_prio(), and we didn't return 0 above, 428 * then right waiter has a dl_prio() too. 429 */ 430 if (dl_prio(left->prio)) 431 return left->deadline == right->deadline; 432 433 return 1; 434 } 435 436 static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, 437 struct rt_mutex_waiter *top_waiter) 438 { 439 if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree)) 440 return true; 441 442 #ifdef RT_MUTEX_BUILD_SPINLOCKS 443 /* 444 * Note that RT tasks are excluded from same priority (lateral) 445 * steals to prevent the introduction of an unbounded latency. 446 */ 447 if (rt_or_dl_prio(waiter->tree.prio)) 448 return false; 449 450 return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); 451 #else 452 return false; 453 #endif 454 } 455 456 #define __node_2_waiter(node) \ 457 rb_entry((node), struct rt_mutex_waiter, tree.entry) 458 459 static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) 460 { 461 struct rt_mutex_waiter *aw = __node_2_waiter(a); 462 struct rt_mutex_waiter *bw = __node_2_waiter(b); 463 464 if (rt_waiter_node_less(&aw->tree, &bw->tree)) 465 return 1; 466 467 if (!build_ww_mutex()) 468 return 0; 469 470 if (rt_waiter_node_less(&bw->tree, &aw->tree)) 471 return 0; 472 473 /* NOTE: relies on waiter->ww_ctx being set before insertion */ 474 if (aw->ww_ctx) { 475 if (!bw->ww_ctx) 476 return 1; 477 478 return (signed long)(aw->ww_ctx->stamp - 479 bw->ww_ctx->stamp) < 0; 480 } 481 482 return 0; 483 } 484 485 static __always_inline void 486 rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) 487 __must_hold(&lock->wait_lock) 488 { 489 lockdep_assert_held(&lock->wait_lock); 490 491 rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less); 492 } 493 494 static __always_inline void 495 rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) 496 __must_hold(&lock->wait_lock) 497 { 498 lockdep_assert_held(&lock->wait_lock); 499 500 if (RB_EMPTY_NODE(&waiter->tree.entry)) 501 return; 502 503 rb_erase_cached(&waiter->tree.entry, &lock->waiters); 504 RB_CLEAR_NODE(&waiter->tree.entry); 505 } 506 507 #define __node_2_rt_node(node) \ 508 rb_entry((node), struct rt_waiter_node, entry) 509 510 static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) 511 { 512 return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b)); 513 } 514 515 static __always_inline void 516 rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) 517 { 518 lockdep_assert_held(&task->pi_lock); 519 520 rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less); 521 } 522 523 static __always_inline void 524 rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) 525 { 526 lockdep_assert_held(&task->pi_lock); 527 528 if (RB_EMPTY_NODE(&waiter->pi_tree.entry)) 529 return; 530 531 rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters); 532 RB_CLEAR_NODE(&waiter->pi_tree.entry); 533 } 534 535 static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock, 536 struct task_struct *p) 537 { 538 struct task_struct *pi_task = NULL; 539 540 lockdep_assert_held(&lock->wait_lock); 541 lockdep_assert(rt_mutex_owner(lock) == p); 542 lockdep_assert_held(&p->pi_lock); 543 544 if (task_has_pi_waiters(p)) 545 pi_task = task_top_pi_waiter(p)->task; 546 547 rt_mutex_setprio(p, pi_task); 548 } 549 550 /* RT mutex specific wake_q wrappers */ 551 static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh, 552 struct task_struct *task, 553 unsigned int wake_state) 554 { 555 if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) { 556 if (IS_ENABLED(CONFIG_PROVE_LOCKING)) 557 WARN_ON_ONCE(wqh->rtlock_task); 558 get_task_struct(task); 559 wqh->rtlock_task = task; 560 } else { 561 wake_q_add(&wqh->head, task); 562 } 563 } 564 565 static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, 566 struct rt_mutex_waiter *w) 567 { 568 rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state); 569 } 570 571 static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh) 572 { 573 if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) { 574 wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT); 575 put_task_struct(wqh->rtlock_task); 576 wqh->rtlock_task = NULL; 577 } 578 579 if (!wake_q_empty(&wqh->head)) 580 wake_up_q(&wqh->head); 581 582 /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ 583 preempt_enable(); 584 } 585 586 /* 587 * Deadlock detection is conditional: 588 * 589 * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted 590 * if the detect argument is == RT_MUTEX_FULL_CHAINWALK. 591 * 592 * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always 593 * conducted independent of the detect argument. 594 * 595 * If the waiter argument is NULL this indicates the deboost path and 596 * deadlock detection is disabled independent of the detect argument 597 * and the config settings. 598 */ 599 static __always_inline bool 600 rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, 601 enum rtmutex_chainwalk chwalk) 602 { 603 if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) 604 return waiter != NULL; 605 return chwalk == RT_MUTEX_FULL_CHAINWALK; 606 } 607 608 static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p) 609 { 610 return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; 611 } 612 613 /* 614 * Adjust the priority chain. Also used for deadlock detection. 615 * Decreases task's usage by one - may thus free the task. 616 * 617 * @task: the task owning the mutex (owner) for which a chain walk is 618 * probably needed 619 * @chwalk: do we have to carry out deadlock detection? 620 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck 621 * things for a task that has just got its priority adjusted, and 622 * is waiting on a mutex) 623 * @next_lock: the mutex on which the owner of @orig_lock was blocked before 624 * we dropped its pi_lock. Is never dereferenced, only used for 625 * comparison to detect lock chain changes. 626 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated 627 * its priority to the mutex owner (can be NULL in the case 628 * depicted above or if the top waiter is gone away and we are 629 * actually deboosting the owner) 630 * @top_task: the current top waiter 631 * 632 * Returns 0 or -EDEADLK. 633 * 634 * Chain walk basics and protection scope 635 * 636 * [R] refcount on task 637 * [Pn] task->pi_lock held 638 * [L] rtmutex->wait_lock held 639 * 640 * Normal locking order: 641 * 642 * rtmutex->wait_lock 643 * task->pi_lock 644 * 645 * Step Description Protected by 646 * function arguments: 647 * @task [R] 648 * @orig_lock if != NULL @top_task is blocked on it 649 * @next_lock Unprotected. Cannot be 650 * dereferenced. Only used for 651 * comparison. 652 * @orig_waiter if != NULL @top_task is blocked on it 653 * @top_task current, or in case of proxy 654 * locking protected by calling 655 * code 656 * again: 657 * loop_sanity_check(); 658 * retry: 659 * [1] lock(task->pi_lock); [R] acquire [P1] 660 * [2] waiter = task->pi_blocked_on; [P1] 661 * [3] check_exit_conditions_1(); [P1] 662 * [4] lock = waiter->lock; [P1] 663 * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L] 664 * unlock(task->pi_lock); release [P1] 665 * goto retry; 666 * } 667 * [6] check_exit_conditions_2(); [P1] + [L] 668 * [7] requeue_lock_waiter(lock, waiter); [P1] + [L] 669 * [8] unlock(task->pi_lock); release [P1] 670 * put_task_struct(task); release [R] 671 * [9] check_exit_conditions_3(); [L] 672 * [10] task = owner(lock); [L] 673 * get_task_struct(task); [L] acquire [R] 674 * lock(task->pi_lock); [L] acquire [P2] 675 * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L] 676 * [12] check_exit_conditions_4(); [P2] + [L] 677 * [13] unlock(task->pi_lock); release [P2] 678 * unlock(lock->wait_lock); release [L] 679 * goto again; 680 * 681 * Where P1 is the blocking task and P2 is the lock owner; going up one step 682 * the owner becomes the next blocked task etc.. 683 * 684 * 685 */ 686 static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, 687 enum rtmutex_chainwalk chwalk, 688 struct rt_mutex_base *orig_lock, 689 struct rt_mutex_base *next_lock, 690 struct rt_mutex_waiter *orig_waiter, 691 struct task_struct *top_task) 692 { 693 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; 694 struct rt_mutex_waiter *prerequeue_top_waiter; 695 int ret = 0, depth = 0; 696 struct rt_mutex_base *lock; 697 bool detect_deadlock; 698 bool requeue = true; 699 700 detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); 701 702 /* 703 * The (de)boosting is a step by step approach with a lot of 704 * pitfalls. We want this to be preemptible and we want hold a 705 * maximum of two locks per step. So we have to check 706 * carefully whether things change under us. 707 */ 708 again: 709 /* 710 * We limit the lock chain length for each invocation. 711 */ 712 if (++depth > max_lock_depth) { 713 static int prev_max; 714 715 /* 716 * Print this only once. If the admin changes the limit, 717 * print a new message when reaching the limit again. 718 */ 719 if (prev_max != max_lock_depth) { 720 prev_max = max_lock_depth; 721 printk(KERN_WARNING "Maximum lock depth %d reached " 722 "task: %s (%d)\n", max_lock_depth, 723 top_task->comm, task_pid_nr(top_task)); 724 } 725 put_task_struct(task); 726 727 return -EDEADLK; 728 } 729 730 /* 731 * We are fully preemptible here and only hold the refcount on 732 * @task. So everything can have changed under us since the 733 * caller or our own code below (goto retry/again) dropped all 734 * locks. 735 */ 736 retry: 737 /* 738 * [1] Task cannot go away as we did a get_task() before ! 739 */ 740 raw_spin_lock_irq(&task->pi_lock); 741 742 /* 743 * [2] Get the waiter on which @task is blocked on. 744 */ 745 waiter = task->pi_blocked_on; 746 747 /* 748 * [3] check_exit_conditions_1() protected by task->pi_lock. 749 */ 750 751 /* 752 * Check whether the end of the boosting chain has been 753 * reached or the state of the chain has changed while we 754 * dropped the locks. 755 */ 756 if (!waiter) 757 goto out_unlock_pi; 758 759 /* 760 * Check the orig_waiter state. After we dropped the locks, 761 * the previous owner of the lock might have released the lock. 762 */ 763 if (orig_waiter && !rt_mutex_owner(orig_lock)) 764 goto out_unlock_pi; 765 766 /* 767 * We dropped all locks after taking a refcount on @task, so 768 * the task might have moved on in the lock chain or even left 769 * the chain completely and blocks now on an unrelated lock or 770 * on @orig_lock. 771 * 772 * We stored the lock on which @task was blocked in @next_lock, 773 * so we can detect the chain change. 774 */ 775 if (next_lock != waiter->lock) 776 goto out_unlock_pi; 777 778 /* 779 * There could be 'spurious' loops in the lock graph due to ww_mutex, 780 * consider: 781 * 782 * P1: A, ww_A, ww_B 783 * P2: ww_B, ww_A 784 * P3: A 785 * 786 * P3 should not return -EDEADLK because it gets trapped in the cycle 787 * created by P1 and P2 (which will resolve -- and runs into 788 * max_lock_depth above). Therefore disable detect_deadlock such that 789 * the below termination condition can trigger once all relevant tasks 790 * are boosted. 791 * 792 * Even when we start with ww_mutex we can disable deadlock detection, 793 * since we would supress a ww_mutex induced deadlock at [6] anyway. 794 * Supressing it here however is not sufficient since we might still 795 * hit [6] due to adjustment driven iteration. 796 * 797 * NOTE: if someone were to create a deadlock between 2 ww_classes we'd 798 * utterly fail to report it; lockdep should. 799 */ 800 if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock) 801 detect_deadlock = false; 802 803 /* 804 * Drop out, when the task has no waiters. Note, 805 * top_waiter can be NULL, when we are in the deboosting 806 * mode! 807 */ 808 if (top_waiter) { 809 if (!task_has_pi_waiters(task)) 810 goto out_unlock_pi; 811 /* 812 * If deadlock detection is off, we stop here if we 813 * are not the top pi waiter of the task. If deadlock 814 * detection is enabled we continue, but stop the 815 * requeueing in the chain walk. 816 */ 817 if (top_waiter != task_top_pi_waiter(task)) { 818 if (!detect_deadlock) 819 goto out_unlock_pi; 820 else 821 requeue = false; 822 } 823 } 824 825 /* 826 * If the waiter priority is the same as the task priority 827 * then there is no further priority adjustment necessary. If 828 * deadlock detection is off, we stop the chain walk. If its 829 * enabled we continue, but stop the requeueing in the chain 830 * walk. 831 */ 832 if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { 833 if (!detect_deadlock) 834 goto out_unlock_pi; 835 else 836 requeue = false; 837 } 838 839 /* 840 * [4] Get the next lock; per holding task->pi_lock we can't unblock 841 * and guarantee @lock's existence. 842 */ 843 lock = waiter->lock; 844 /* 845 * [5] We need to trylock here as we are holding task->pi_lock, 846 * which is the reverse lock order versus the other rtmutex 847 * operations. 848 * 849 * Per the above, holding task->pi_lock guarantees lock exists, so 850 * inverting this lock order is infeasible from a life-time 851 * perspective. 852 */ 853 if (!raw_spin_trylock(&lock->wait_lock)) { 854 raw_spin_unlock_irq(&task->pi_lock); 855 cpu_relax(); 856 goto retry; 857 } 858 859 /* 860 * [6] check_exit_conditions_2() protected by task->pi_lock and 861 * lock->wait_lock. 862 * 863 * Deadlock detection. If the lock is the same as the original 864 * lock which caused us to walk the lock chain or if the 865 * current lock is owned by the task which initiated the chain 866 * walk, we detected a deadlock. 867 */ 868 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 869 ret = -EDEADLK; 870 871 /* 872 * When the deadlock is due to ww_mutex; also see above. Don't 873 * report the deadlock and instead let the ww_mutex wound/die 874 * logic pick which of the contending threads gets -EDEADLK. 875 * 876 * NOTE: assumes the cycle only contains a single ww_class; any 877 * other configuration and we fail to report; also, see 878 * lockdep. 879 */ 880 if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx) 881 ret = 0; 882 883 raw_spin_unlock(&lock->wait_lock); 884 goto out_unlock_pi; 885 } 886 887 /* 888 * If we just follow the lock chain for deadlock detection, no 889 * need to do all the requeue operations. To avoid a truckload 890 * of conditionals around the various places below, just do the 891 * minimum chain walk checks. 892 */ 893 if (!requeue) { 894 /* 895 * No requeue[7] here. Just release @task [8] 896 */ 897 raw_spin_unlock(&task->pi_lock); 898 put_task_struct(task); 899 900 /* 901 * [9] check_exit_conditions_3 protected by lock->wait_lock. 902 * If there is no owner of the lock, end of chain. 903 */ 904 if (!rt_mutex_owner(lock)) { 905 raw_spin_unlock_irq(&lock->wait_lock); 906 return 0; 907 } 908 909 /* [10] Grab the next task, i.e. owner of @lock */ 910 task = get_task_struct(rt_mutex_owner(lock)); 911 raw_spin_lock(&task->pi_lock); 912 913 /* 914 * No requeue [11] here. We just do deadlock detection. 915 * 916 * [12] Store whether owner is blocked 917 * itself. Decision is made after dropping the locks 918 */ 919 next_lock = task_blocked_on_lock(task); 920 /* 921 * Get the top waiter for the next iteration 922 */ 923 top_waiter = rt_mutex_top_waiter(lock); 924 925 /* [13] Drop locks */ 926 raw_spin_unlock(&task->pi_lock); 927 raw_spin_unlock_irq(&lock->wait_lock); 928 929 /* If owner is not blocked, end of chain. */ 930 if (!next_lock) 931 goto out_put_task; 932 goto again; 933 } 934 935 /* 936 * Store the current top waiter before doing the requeue 937 * operation on @lock. We need it for the boost/deboost 938 * decision below. 939 */ 940 prerequeue_top_waiter = rt_mutex_top_waiter(lock); 941 942 /* [7] Requeue the waiter in the lock waiter tree. */ 943 rt_mutex_dequeue(lock, waiter); 944 945 /* 946 * Update the waiter prio fields now that we're dequeued. 947 * 948 * These values can have changed through either: 949 * 950 * sys_sched_set_scheduler() / sys_sched_setattr() 951 * 952 * or 953 * 954 * DL CBS enforcement advancing the effective deadline. 955 */ 956 waiter_update_prio(waiter, task); 957 958 rt_mutex_enqueue(lock, waiter); 959 960 /* 961 * [8] Release the (blocking) task in preparation for 962 * taking the owner task in [10]. 963 * 964 * Since we hold lock->waiter_lock, task cannot unblock, even if we 965 * release task->pi_lock. 966 */ 967 raw_spin_unlock(&task->pi_lock); 968 put_task_struct(task); 969 970 /* 971 * [9] check_exit_conditions_3 protected by lock->wait_lock. 972 * 973 * We must abort the chain walk if there is no lock owner even 974 * in the dead lock detection case, as we have nothing to 975 * follow here. This is the end of the chain we are walking. 976 */ 977 if (!rt_mutex_owner(lock)) { 978 /* 979 * If the requeue [7] above changed the top waiter, 980 * then we need to wake the new top waiter up to try 981 * to get the lock. 982 */ 983 top_waiter = rt_mutex_top_waiter(lock); 984 if (prerequeue_top_waiter != top_waiter) 985 wake_up_state(top_waiter->task, top_waiter->wake_state); 986 raw_spin_unlock_irq(&lock->wait_lock); 987 return 0; 988 } 989 990 /* 991 * [10] Grab the next task, i.e. the owner of @lock 992 * 993 * Per holding lock->wait_lock and checking for !owner above, there 994 * must be an owner and it cannot go away. 995 */ 996 task = get_task_struct(rt_mutex_owner(lock)); 997 raw_spin_lock(&task->pi_lock); 998 999 /* [11] requeue the pi waiters if necessary */ 1000 if (waiter == rt_mutex_top_waiter(lock)) { 1001 /* 1002 * The waiter became the new top (highest priority) 1003 * waiter on the lock. Replace the previous top waiter 1004 * in the owner tasks pi waiters tree with this waiter 1005 * and adjust the priority of the owner. 1006 */ 1007 rt_mutex_dequeue_pi(task, prerequeue_top_waiter); 1008 waiter_clone_prio(waiter, task); 1009 rt_mutex_enqueue_pi(task, waiter); 1010 rt_mutex_adjust_prio(lock, task); 1011 1012 } else if (prerequeue_top_waiter == waiter) { 1013 /* 1014 * The waiter was the top waiter on the lock, but is 1015 * no longer the top priority waiter. Replace waiter in 1016 * the owner tasks pi waiters tree with the new top 1017 * (highest priority) waiter and adjust the priority 1018 * of the owner. 1019 * The new top waiter is stored in @waiter so that 1020 * @waiter == @top_waiter evaluates to true below and 1021 * we continue to deboost the rest of the chain. 1022 */ 1023 rt_mutex_dequeue_pi(task, waiter); 1024 waiter = rt_mutex_top_waiter(lock); 1025 waiter_clone_prio(waiter, task); 1026 rt_mutex_enqueue_pi(task, waiter); 1027 rt_mutex_adjust_prio(lock, task); 1028 } else { 1029 /* 1030 * Nothing changed. No need to do any priority 1031 * adjustment. 1032 */ 1033 } 1034 1035 /* 1036 * [12] check_exit_conditions_4() protected by task->pi_lock 1037 * and lock->wait_lock. The actual decisions are made after we 1038 * dropped the locks. 1039 * 1040 * Check whether the task which owns the current lock is pi 1041 * blocked itself. If yes we store a pointer to the lock for 1042 * the lock chain change detection above. After we dropped 1043 * task->pi_lock next_lock cannot be dereferenced anymore. 1044 */ 1045 next_lock = task_blocked_on_lock(task); 1046 /* 1047 * Store the top waiter of @lock for the end of chain walk 1048 * decision below. 1049 */ 1050 top_waiter = rt_mutex_top_waiter(lock); 1051 1052 /* [13] Drop the locks */ 1053 raw_spin_unlock(&task->pi_lock); 1054 raw_spin_unlock_irq(&lock->wait_lock); 1055 1056 /* 1057 * Make the actual exit decisions [12], based on the stored 1058 * values. 1059 * 1060 * We reached the end of the lock chain. Stop right here. No 1061 * point to go back just to figure that out. 1062 */ 1063 if (!next_lock) 1064 goto out_put_task; 1065 1066 /* 1067 * If the current waiter is not the top waiter on the lock, 1068 * then we can stop the chain walk here if we are not in full 1069 * deadlock detection mode. 1070 */ 1071 if (!detect_deadlock && waiter != top_waiter) 1072 goto out_put_task; 1073 1074 goto again; 1075 1076 out_unlock_pi: 1077 raw_spin_unlock_irq(&task->pi_lock); 1078 out_put_task: 1079 put_task_struct(task); 1080 1081 return ret; 1082 } 1083 1084 /* 1085 * Try to take an rt-mutex 1086 * 1087 * Must be called with lock->wait_lock held and interrupts disabled 1088 * 1089 * @lock: The lock to be acquired. 1090 * @task: The task which wants to acquire the lock 1091 * @waiter: The waiter that is queued to the lock's wait tree if the 1092 * callsite called task_blocked_on_lock(), otherwise NULL 1093 */ 1094 static int __sched 1095 try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, 1096 struct rt_mutex_waiter *waiter) 1097 __must_hold(&lock->wait_lock) 1098 { 1099 lockdep_assert_held(&lock->wait_lock); 1100 1101 /* 1102 * Before testing whether we can acquire @lock, we set the 1103 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all 1104 * other tasks which try to modify @lock into the slow path 1105 * and they serialize on @lock->wait_lock. 1106 * 1107 * The RT_MUTEX_HAS_WAITERS bit can have a transitional state 1108 * as explained at the top of this file if and only if: 1109 * 1110 * - There is a lock owner. The caller must fixup the 1111 * transient state if it does a trylock or leaves the lock 1112 * function due to a signal or timeout. 1113 * 1114 * - @task acquires the lock and there are no other 1115 * waiters. This is undone in rt_mutex_set_owner(@task) at 1116 * the end of this function. 1117 */ 1118 mark_rt_mutex_waiters(lock); 1119 1120 /* 1121 * If @lock has an owner, give up. 1122 */ 1123 if (rt_mutex_owner(lock)) 1124 return 0; 1125 1126 /* 1127 * If @waiter != NULL, @task has already enqueued the waiter 1128 * into @lock waiter tree. If @waiter == NULL then this is a 1129 * trylock attempt. 1130 */ 1131 if (waiter) { 1132 struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock); 1133 1134 /* 1135 * If waiter is the highest priority waiter of @lock, 1136 * or allowed to steal it, take it over. 1137 */ 1138 if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) { 1139 /* 1140 * We can acquire the lock. Remove the waiter from the 1141 * lock waiters tree. 1142 */ 1143 rt_mutex_dequeue(lock, waiter); 1144 } else { 1145 return 0; 1146 } 1147 } else { 1148 /* 1149 * If the lock has waiters already we check whether @task is 1150 * eligible to take over the lock. 1151 * 1152 * If there are no other waiters, @task can acquire 1153 * the lock. @task->pi_blocked_on is NULL, so it does 1154 * not need to be dequeued. 1155 */ 1156 if (rt_mutex_has_waiters(lock)) { 1157 /* Check whether the trylock can steal it. */ 1158 if (!rt_mutex_steal(task_to_waiter(task), 1159 rt_mutex_top_waiter(lock))) 1160 return 0; 1161 1162 /* 1163 * The current top waiter stays enqueued. We 1164 * don't have to change anything in the lock 1165 * waiters order. 1166 */ 1167 } else { 1168 /* 1169 * No waiters. Take the lock without the 1170 * pi_lock dance.@task->pi_blocked_on is NULL 1171 * and we have no waiters to enqueue in @task 1172 * pi waiters tree. 1173 */ 1174 goto takeit; 1175 } 1176 } 1177 1178 /* 1179 * Clear @task->pi_blocked_on. Requires protection by 1180 * @task->pi_lock. Redundant operation for the @waiter == NULL 1181 * case, but conditionals are more expensive than a redundant 1182 * store. 1183 */ 1184 raw_spin_lock(&task->pi_lock); 1185 task->pi_blocked_on = NULL; 1186 /* 1187 * Finish the lock acquisition. @task is the new owner. If 1188 * other waiters exist we have to insert the highest priority 1189 * waiter into @task->pi_waiters tree. 1190 */ 1191 if (rt_mutex_has_waiters(lock)) 1192 rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); 1193 raw_spin_unlock(&task->pi_lock); 1194 1195 takeit: 1196 /* 1197 * This either preserves the RT_MUTEX_HAS_WAITERS bit if there 1198 * are still waiters or clears it. 1199 */ 1200 rt_mutex_set_owner(lock, task); 1201 1202 return 1; 1203 } 1204 1205 /* 1206 * Task blocks on lock. 1207 * 1208 * Prepare waiter and propagate pi chain 1209 * 1210 * This must be called with lock->wait_lock held and interrupts disabled 1211 */ 1212 static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, 1213 struct rt_mutex_waiter *waiter, 1214 struct task_struct *task, 1215 struct ww_acquire_ctx *ww_ctx, 1216 enum rtmutex_chainwalk chwalk, 1217 struct wake_q_head *wake_q) 1218 __must_hold(&lock->wait_lock) 1219 { 1220 struct task_struct *owner = rt_mutex_owner(lock); 1221 struct rt_mutex_waiter *top_waiter = waiter; 1222 struct rt_mutex_base *next_lock; 1223 int chain_walk = 0, res; 1224 1225 lockdep_assert_held(&lock->wait_lock); 1226 1227 /* 1228 * Early deadlock detection. We really don't want the task to 1229 * enqueue on itself just to untangle the mess later. It's not 1230 * only an optimization. We drop the locks, so another waiter 1231 * can come in before the chain walk detects the deadlock. So 1232 * the other will detect the deadlock and return -EDEADLOCK, 1233 * which is wrong, as the other waiter is not in a deadlock 1234 * situation. 1235 * 1236 * Except for ww_mutex, in that case the chain walk must already deal 1237 * with spurious cycles, see the comments at [3] and [6]. 1238 */ 1239 if (owner == task && !(build_ww_mutex() && ww_ctx)) 1240 return -EDEADLK; 1241 1242 raw_spin_lock(&task->pi_lock); 1243 waiter->task = task; 1244 waiter->lock = lock; 1245 waiter_update_prio(waiter, task); 1246 waiter_clone_prio(waiter, task); 1247 1248 /* Get the top priority waiter on the lock */ 1249 if (rt_mutex_has_waiters(lock)) 1250 top_waiter = rt_mutex_top_waiter(lock); 1251 rt_mutex_enqueue(lock, waiter); 1252 1253 task->pi_blocked_on = waiter; 1254 1255 raw_spin_unlock(&task->pi_lock); 1256 1257 if (build_ww_mutex() && ww_ctx) { 1258 struct rt_mutex *rtm; 1259 1260 /* Check whether the waiter should back out immediately */ 1261 rtm = container_of(lock, struct rt_mutex, rtmutex); 1262 __assume_ctx_lock(&rtm->rtmutex.wait_lock); 1263 res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); 1264 if (res) { 1265 raw_spin_lock(&task->pi_lock); 1266 rt_mutex_dequeue(lock, waiter); 1267 task->pi_blocked_on = NULL; 1268 raw_spin_unlock(&task->pi_lock); 1269 return res; 1270 } 1271 } 1272 1273 if (!owner) 1274 return 0; 1275 1276 raw_spin_lock(&owner->pi_lock); 1277 if (waiter == rt_mutex_top_waiter(lock)) { 1278 rt_mutex_dequeue_pi(owner, top_waiter); 1279 rt_mutex_enqueue_pi(owner, waiter); 1280 1281 rt_mutex_adjust_prio(lock, owner); 1282 if (owner->pi_blocked_on) 1283 chain_walk = 1; 1284 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { 1285 chain_walk = 1; 1286 } 1287 1288 /* Store the lock on which owner is blocked or NULL */ 1289 next_lock = task_blocked_on_lock(owner); 1290 1291 raw_spin_unlock(&owner->pi_lock); 1292 /* 1293 * Even if full deadlock detection is on, if the owner is not 1294 * blocked itself, we can avoid finding this out in the chain 1295 * walk. 1296 */ 1297 if (!chain_walk || !next_lock) 1298 return 0; 1299 1300 /* 1301 * The owner can't disappear while holding a lock, 1302 * so the owner struct is protected by wait_lock. 1303 * Gets dropped in rt_mutex_adjust_prio_chain()! 1304 */ 1305 get_task_struct(owner); 1306 1307 raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); 1308 1309 res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, 1310 next_lock, waiter, task); 1311 1312 raw_spin_lock_irq(&lock->wait_lock); 1313 1314 return res; 1315 } 1316 1317 /* 1318 * Remove the top waiter from the current tasks pi waiter tree and 1319 * queue it up. 1320 * 1321 * Called with lock->wait_lock held and interrupts disabled. 1322 */ 1323 static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, 1324 struct rt_mutex_base *lock) 1325 __must_hold(&lock->wait_lock) 1326 { 1327 struct rt_mutex_waiter *waiter; 1328 1329 lockdep_assert_held(&lock->wait_lock); 1330 1331 raw_spin_lock(¤t->pi_lock); 1332 1333 waiter = rt_mutex_top_waiter(lock); 1334 1335 /* 1336 * Remove it from current->pi_waiters and deboost. 1337 * 1338 * We must in fact deboost here in order to ensure we call 1339 * rt_mutex_setprio() to update p->pi_top_task before the 1340 * task unblocks. 1341 */ 1342 rt_mutex_dequeue_pi(current, waiter); 1343 rt_mutex_adjust_prio(lock, current); 1344 1345 /* 1346 * As we are waking up the top waiter, and the waiter stays 1347 * queued on the lock until it gets the lock, this lock 1348 * obviously has waiters. Just set the bit here and this has 1349 * the added benefit of forcing all new tasks into the 1350 * slow path making sure no task of lower priority than 1351 * the top waiter can steal this lock. 1352 */ 1353 lock->owner = (void *) RT_MUTEX_HAS_WAITERS; 1354 1355 /* 1356 * We deboosted before waking the top waiter task such that we don't 1357 * run two tasks with the 'same' priority (and ensure the 1358 * p->pi_top_task pointer points to a blocked task). This however can 1359 * lead to priority inversion if we would get preempted after the 1360 * deboost but before waking our donor task, hence the preempt_disable() 1361 * before unlock. 1362 * 1363 * Pairs with preempt_enable() in rt_mutex_wake_up_q(); 1364 */ 1365 preempt_disable(); 1366 rt_mutex_wake_q_add(wqh, waiter); 1367 raw_spin_unlock(¤t->pi_lock); 1368 } 1369 1370 static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) 1371 __must_hold(&lock->wait_lock) 1372 { 1373 int ret = try_to_take_rt_mutex(lock, current, NULL); 1374 1375 /* 1376 * try_to_take_rt_mutex() sets the lock waiters bit 1377 * unconditionally. Clean this up. 1378 */ 1379 fixup_rt_mutex_waiters(lock, true); 1380 1381 return ret; 1382 } 1383 1384 /* 1385 * Slow path try-lock function: 1386 */ 1387 static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock) 1388 { 1389 unsigned long flags; 1390 int ret; 1391 1392 /* 1393 * If the lock already has an owner we fail to get the lock. 1394 * This can be done without taking the @lock->wait_lock as 1395 * it is only being read, and this is a trylock anyway. 1396 */ 1397 if (rt_mutex_owner(lock)) 1398 return 0; 1399 1400 /* 1401 * The mutex has currently no owner. Lock the wait lock and try to 1402 * acquire the lock. We use irqsave here to support early boot calls. 1403 */ 1404 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1405 1406 ret = __rt_mutex_slowtrylock(lock); 1407 1408 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 1409 1410 return ret; 1411 } 1412 1413 static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock) 1414 { 1415 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) 1416 return 1; 1417 1418 return rt_mutex_slowtrylock(lock); 1419 } 1420 1421 /* 1422 * Slow path to release a rt-mutex. 1423 */ 1424 static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) 1425 { 1426 DEFINE_RT_WAKE_Q(wqh); 1427 unsigned long flags; 1428 1429 /* irqsave required to support early boot calls */ 1430 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1431 1432 debug_rt_mutex_unlock(lock); 1433 1434 /* 1435 * We must be careful here if the fast path is enabled. If we 1436 * have no waiters queued we cannot set owner to NULL here 1437 * because of: 1438 * 1439 * foo->lock->owner = NULL; 1440 * rtmutex_lock(foo->lock); <- fast path 1441 * free = atomic_dec_and_test(foo->refcnt); 1442 * rtmutex_unlock(foo->lock); <- fast path 1443 * if (free) 1444 * kfree(foo); 1445 * raw_spin_unlock(foo->lock->wait_lock); 1446 * 1447 * So for the fastpath enabled kernel: 1448 * 1449 * Nothing can set the waiters bit as long as we hold 1450 * lock->wait_lock. So we do the following sequence: 1451 * 1452 * owner = rt_mutex_owner(lock); 1453 * clear_rt_mutex_waiters(lock); 1454 * raw_spin_unlock(&lock->wait_lock); 1455 * if (cmpxchg(&lock->owner, owner, 0) == owner) 1456 * return; 1457 * goto retry; 1458 * 1459 * The fastpath disabled variant is simple as all access to 1460 * lock->owner is serialized by lock->wait_lock: 1461 * 1462 * lock->owner = NULL; 1463 * raw_spin_unlock(&lock->wait_lock); 1464 */ 1465 while (!rt_mutex_has_waiters(lock)) { 1466 /* Drops lock->wait_lock ! */ 1467 if (unlock_rt_mutex_safe(lock, flags) == true) 1468 return; 1469 /* Relock the rtmutex and try again */ 1470 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1471 } 1472 1473 trace_contended_release(lock); 1474 /* 1475 * The wakeup next waiter path does not suffer from the above 1476 * race. See the comments there. 1477 * 1478 * Queue the next waiter for wakeup once we release the wait_lock. 1479 */ 1480 mark_wakeup_next_waiter(&wqh, lock); 1481 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 1482 1483 rt_mutex_wake_up_q(&wqh); 1484 } 1485 1486 static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) 1487 { 1488 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) 1489 return; 1490 1491 rt_mutex_slowunlock(lock); 1492 } 1493 1494 #ifdef CONFIG_SMP 1495 static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, 1496 struct rt_mutex_waiter *waiter, 1497 struct task_struct *owner) 1498 { 1499 bool res = true; 1500 1501 rcu_read_lock(); 1502 for (;;) { 1503 /* If owner changed, trylock again. */ 1504 if (owner != rt_mutex_owner(lock)) 1505 break; 1506 /* 1507 * Ensure that @owner is dereferenced after checking that 1508 * the lock owner still matches @owner. If that fails, 1509 * @owner might point to freed memory. If it still matches, 1510 * the rcu_read_lock() ensures the memory stays valid. 1511 */ 1512 barrier(); 1513 /* 1514 * Stop spinning when: 1515 * - the lock owner has been scheduled out 1516 * - current is not longer the top waiter 1517 * - current is requested to reschedule (redundant 1518 * for CONFIG_PREEMPT_RCU=y) 1519 * - the VCPU on which owner runs is preempted 1520 */ 1521 if (!owner_on_cpu(owner) || need_resched() || 1522 !data_race(rt_mutex_waiter_is_top_waiter(lock, waiter))) { 1523 res = false; 1524 break; 1525 } 1526 cpu_relax(); 1527 } 1528 rcu_read_unlock(); 1529 return res; 1530 } 1531 #else 1532 static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, 1533 struct rt_mutex_waiter *waiter, 1534 struct task_struct *owner) 1535 { 1536 return false; 1537 } 1538 #endif 1539 1540 #ifdef RT_MUTEX_BUILD_MUTEX 1541 /* 1542 * Functions required for: 1543 * - rtmutex, futex on all kernels 1544 * - mutex and rwsem substitutions on RT kernels 1545 */ 1546 1547 /* 1548 * Remove a waiter from a lock and give up 1549 * 1550 * Must be called with lock->wait_lock held and interrupts disabled. It must 1551 * have just failed to try_to_take_rt_mutex(). 1552 * 1553 * When invoked from rt_mutex_start_proxy_lock() waiter::task != current ! 1554 */ 1555 static void __sched remove_waiter(struct rt_mutex_base *lock, 1556 struct rt_mutex_waiter *waiter) 1557 __must_hold(&lock->wait_lock) 1558 { 1559 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); 1560 struct task_struct *owner = rt_mutex_owner(lock); 1561 struct task_struct *waiter_task = waiter->task; 1562 struct rt_mutex_base *next_lock; 1563 1564 lockdep_assert_held(&lock->wait_lock); 1565 1566 if (!waiter_task) /* never enqueued */ 1567 return; 1568 1569 scoped_guard(raw_spinlock, &waiter_task->pi_lock) { 1570 rt_mutex_dequeue(lock, waiter); 1571 waiter_task->pi_blocked_on = NULL; 1572 } 1573 1574 /* 1575 * Only update priority if the waiter was the highest priority 1576 * waiter of the lock and there is an owner to update. 1577 */ 1578 if (!owner || !is_top_waiter) 1579 return; 1580 1581 raw_spin_lock(&owner->pi_lock); 1582 1583 rt_mutex_dequeue_pi(owner, waiter); 1584 1585 if (rt_mutex_has_waiters(lock)) 1586 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); 1587 1588 rt_mutex_adjust_prio(lock, owner); 1589 1590 /* Store the lock on which owner is blocked or NULL */ 1591 next_lock = task_blocked_on_lock(owner); 1592 1593 raw_spin_unlock(&owner->pi_lock); 1594 1595 /* 1596 * Don't walk the chain, if the owner task is not blocked 1597 * itself. 1598 */ 1599 if (!next_lock) 1600 return; 1601 1602 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 1603 get_task_struct(owner); 1604 1605 raw_spin_unlock_irq(&lock->wait_lock); 1606 1607 rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, 1608 next_lock, NULL, waiter_task); 1609 1610 raw_spin_lock_irq(&lock->wait_lock); 1611 } 1612 1613 /** 1614 * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop 1615 * @lock: the rt_mutex to take 1616 * @ww_ctx: WW mutex context pointer 1617 * @state: the state the task should block in (TASK_INTERRUPTIBLE 1618 * or TASK_UNINTERRUPTIBLE) 1619 * @timeout: the pre-initialized and started timer, or NULL for none 1620 * @waiter: the pre-initialized rt_mutex_waiter 1621 * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock 1622 * 1623 * Must be called with lock->wait_lock held and interrupts disabled 1624 */ 1625 static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, 1626 struct ww_acquire_ctx *ww_ctx, 1627 unsigned int state, 1628 struct hrtimer_sleeper *timeout, 1629 struct rt_mutex_waiter *waiter, 1630 struct wake_q_head *wake_q) 1631 __releases(&lock->wait_lock) __acquires(&lock->wait_lock) 1632 { 1633 struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); 1634 struct task_struct *owner; 1635 int ret = 0; 1636 1637 __assume_ctx_lock(&rtm->rtmutex.wait_lock); 1638 1639 lockevent_inc(rtmutex_slow_block); 1640 for (;;) { 1641 /* Try to acquire the lock: */ 1642 if (try_to_take_rt_mutex(lock, current, waiter)) { 1643 lockevent_inc(rtmutex_slow_acq3); 1644 break; 1645 } 1646 1647 if (timeout && !timeout->task) { 1648 ret = -ETIMEDOUT; 1649 break; 1650 } 1651 if (signal_pending_state(state, current)) { 1652 ret = -EINTR; 1653 break; 1654 } 1655 1656 if (build_ww_mutex() && ww_ctx) { 1657 ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx); 1658 if (ret) 1659 break; 1660 } 1661 1662 if (waiter == rt_mutex_top_waiter(lock)) 1663 owner = rt_mutex_owner(lock); 1664 else 1665 owner = NULL; 1666 raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); 1667 1668 if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) { 1669 lockevent_inc(rtmutex_slow_sleep); 1670 rt_mutex_schedule(); 1671 } 1672 1673 raw_spin_lock_irq(&lock->wait_lock); 1674 set_current_state(state); 1675 } 1676 1677 __set_current_state(TASK_RUNNING); 1678 return ret; 1679 } 1680 1681 static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, 1682 struct rt_mutex_base *lock, 1683 struct rt_mutex_waiter *w) 1684 __must_hold(&lock->wait_lock) 1685 { 1686 /* 1687 * If the result is not -EDEADLOCK or the caller requested 1688 * deadlock detection, nothing to do here. 1689 */ 1690 if (res != -EDEADLOCK || detect_deadlock) 1691 return; 1692 1693 if (build_ww_mutex() && w->ww_ctx) 1694 return; 1695 1696 raw_spin_unlock_irq(&lock->wait_lock); 1697 1698 WARN(1, "rtmutex deadlock detected\n"); 1699 1700 while (1) { 1701 set_current_state(TASK_INTERRUPTIBLE); 1702 rt_mutex_schedule(); 1703 } 1704 } 1705 1706 /** 1707 * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held 1708 * @lock: The rtmutex to block lock 1709 * @ww_ctx: WW mutex context pointer 1710 * @state: The task state for sleeping 1711 * @chwalk: Indicator whether full or partial chainwalk is requested 1712 * @waiter: Initializer waiter for blocking 1713 * @wake_q: The wake_q to wake tasks after we release the wait_lock 1714 */ 1715 static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, 1716 struct ww_acquire_ctx *ww_ctx, 1717 unsigned int state, 1718 enum rtmutex_chainwalk chwalk, 1719 struct rt_mutex_waiter *waiter, 1720 struct wake_q_head *wake_q) 1721 __must_hold(&lock->wait_lock) 1722 { 1723 struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); 1724 struct ww_mutex *ww = ww_container_of(rtm); 1725 int ret; 1726 1727 __assume_ctx_lock(&rtm->rtmutex.wait_lock); 1728 lockdep_assert_held(&lock->wait_lock); 1729 lockevent_inc(rtmutex_slowlock); 1730 1731 /* Try to acquire the lock again: */ 1732 if (try_to_take_rt_mutex(lock, current, NULL)) { 1733 if (build_ww_mutex() && ww_ctx) { 1734 __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); 1735 ww_mutex_lock_acquired(ww, ww_ctx); 1736 } 1737 lockevent_inc(rtmutex_slow_acq1); 1738 return 0; 1739 } 1740 1741 set_current_state(state); 1742 1743 trace_contention_begin(lock, LCB_F_RT); 1744 1745 ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q); 1746 if (likely(!ret)) 1747 ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q); 1748 1749 if (likely(!ret)) { 1750 /* acquired the lock */ 1751 if (build_ww_mutex() && ww_ctx) { 1752 if (!ww_ctx->is_wait_die) 1753 __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); 1754 ww_mutex_lock_acquired(ww, ww_ctx); 1755 } 1756 lockevent_inc(rtmutex_slow_acq2); 1757 } else { 1758 __set_current_state(TASK_RUNNING); 1759 remove_waiter(lock, waiter); 1760 rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); 1761 lockevent_inc(rtmutex_deadlock); 1762 } 1763 1764 /* 1765 * try_to_take_rt_mutex() sets the waiter bit 1766 * unconditionally. We might have to fix that up. 1767 */ 1768 fixup_rt_mutex_waiters(lock, true); 1769 1770 trace_contention_end(lock, ret); 1771 1772 return ret; 1773 } 1774 1775 static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, 1776 struct ww_acquire_ctx *ww_ctx, 1777 unsigned int state, 1778 struct wake_q_head *wake_q) 1779 __must_hold(&lock->wait_lock) 1780 { 1781 struct rt_mutex_waiter waiter; 1782 int ret; 1783 1784 rt_mutex_init_waiter(&waiter); 1785 waiter.ww_ctx = ww_ctx; 1786 1787 ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK, 1788 &waiter, wake_q); 1789 1790 debug_rt_mutex_free_waiter(&waiter); 1791 lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q)); 1792 return ret; 1793 } 1794 1795 /* 1796 * rt_mutex_slowlock - Locking slowpath invoked when fast path fails 1797 * @lock: The rtmutex to block lock 1798 * @ww_ctx: WW mutex context pointer 1799 * @state: The task state for sleeping 1800 */ 1801 static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, 1802 struct ww_acquire_ctx *ww_ctx, 1803 unsigned int state) 1804 { 1805 DEFINE_WAKE_Q(wake_q); 1806 unsigned long flags; 1807 int ret; 1808 1809 /* 1810 * Do all pre-schedule work here, before we queue a waiter and invoke 1811 * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would 1812 * otherwise recurse back into task_blocks_on_rt_mutex() through 1813 * rtlock_slowlock() and will then enqueue a second waiter for this 1814 * same task and things get really confusing real fast. 1815 */ 1816 rt_mutex_pre_schedule(); 1817 1818 /* 1819 * Technically we could use raw_spin_[un]lock_irq() here, but this can 1820 * be called in early boot if the cmpxchg() fast path is disabled 1821 * (debug, no architecture support). In this case we will acquire the 1822 * rtmutex with lock->wait_lock held. But we cannot unconditionally 1823 * enable interrupts in that early boot case. So we need to use the 1824 * irqsave/restore variants. 1825 */ 1826 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1827 ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q); 1828 raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); 1829 rt_mutex_post_schedule(); 1830 1831 return ret; 1832 } 1833 1834 static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, 1835 unsigned int state) 1836 { 1837 lockdep_assert(!current->pi_blocked_on); 1838 1839 if (likely(rt_mutex_try_acquire(lock))) 1840 return 0; 1841 1842 return rt_mutex_slowlock(lock, NULL, state); 1843 } 1844 #endif /* RT_MUTEX_BUILD_MUTEX */ 1845 1846 #ifdef RT_MUTEX_BUILD_SPINLOCKS 1847 /* 1848 * Functions required for spin/rw_lock substitution on RT kernels 1849 */ 1850 1851 /** 1852 * rtlock_slowlock_locked - Slow path lock acquisition for RT locks 1853 * @lock: The underlying RT mutex 1854 * @wake_q: The wake_q to wake tasks after we release the wait_lock 1855 */ 1856 static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, 1857 struct wake_q_head *wake_q) 1858 __releases(&lock->wait_lock) __acquires(&lock->wait_lock) 1859 { 1860 struct rt_mutex_waiter waiter; 1861 struct task_struct *owner; 1862 1863 lockdep_assert_held(&lock->wait_lock); 1864 lockevent_inc(rtlock_slowlock); 1865 1866 if (try_to_take_rt_mutex(lock, current, NULL)) { 1867 lockevent_inc(rtlock_slow_acq1); 1868 return; 1869 } 1870 1871 rt_mutex_init_rtlock_waiter(&waiter); 1872 1873 /* Save current state and set state to TASK_RTLOCK_WAIT */ 1874 current_save_and_set_rtlock_wait_state(); 1875 1876 trace_contention_begin(lock, LCB_F_RT); 1877 1878 task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK, wake_q); 1879 1880 for (;;) { 1881 /* Try to acquire the lock again */ 1882 if (try_to_take_rt_mutex(lock, current, &waiter)) { 1883 lockevent_inc(rtlock_slow_acq2); 1884 break; 1885 } 1886 1887 if (&waiter == rt_mutex_top_waiter(lock)) 1888 owner = rt_mutex_owner(lock); 1889 else 1890 owner = NULL; 1891 raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); 1892 1893 if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) { 1894 lockevent_inc(rtlock_slow_sleep); 1895 schedule_rtlock(); 1896 } 1897 1898 raw_spin_lock_irq(&lock->wait_lock); 1899 set_current_state(TASK_RTLOCK_WAIT); 1900 } 1901 1902 /* Restore the task state */ 1903 current_restore_rtlock_saved_state(); 1904 1905 /* 1906 * try_to_take_rt_mutex() sets the waiter bit unconditionally. 1907 * We might have to fix that up: 1908 */ 1909 fixup_rt_mutex_waiters(lock, true); 1910 debug_rt_mutex_free_waiter(&waiter); 1911 1912 trace_contention_end(lock, 0); 1913 lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q)); 1914 } 1915 1916 static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) 1917 { 1918 unsigned long flags; 1919 DEFINE_WAKE_Q(wake_q); 1920 1921 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1922 rtlock_slowlock_locked(lock, &wake_q); 1923 raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); 1924 } 1925 1926 #endif /* RT_MUTEX_BUILD_SPINLOCKS */ 1927