// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 *          Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif
#include <trace/events/lock.h>
#include <asm/rqspinlock.h>
#include <linux/timekeeping.h>

/*
 * Include queued spinlock definitions and statistics code
 */
#ifdef CONFIG_QUEUED_SPINLOCKS
#include "../locking/qspinlock.h"
#include "../locking/lock_events.h"
#include "rqspinlock.h"
#include "../locking/mcs_spinlock.h"
#endif

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
 * Scott") is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock, however to
 * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
 * existing API, we must modify it somehow.
 *
 * In particular; where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending (next->locked), we compress both these: {tail,
 * next->locked} into a single u32 value.
 *
 * Since a spinlock disables recursion of its own context and there is a limit
 * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
 * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
 * we can encode the tail by combining the 2-bit nesting level with the cpu
 * number. With one byte for the lock value and 3 bytes for the tail, only a
 * 32-bit word is now needed. Even though we only need 1 bit for the lock,
 * we extend it to a full byte to achieve better performance for architectures
 * that support atomic byte write.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node; whereby avoiding the need to carry a node from lock to unlock, and
 * preserving existing lock API. This also makes the unlock code simpler and
 * faster.
 *
 * N.B. The current implementation only supports architectures that allow
 * atomic operations on smaller 8-bit and 16-bit data types.
 *
 */
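
/*
 * As a rough illustration of the encoding described above (this sketch
 * assumes a configuration with fewer than 16K CPUs; the authoritative
 * layout and helpers live in qspinlock_types.h and qspinlock.h):
 *
 *	Bitfields in the 32-bit lock word:
 *
 *	 0- 7: locked byte
 *	    8: pending
 *	 9-15: not used
 *	16-17: tail index (2-bit nesting level)
 *	18-31: tail cpu (+1, so that tail == 0 means "no tail")
 *
 *	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
 *	tail |= idx << _Q_TAIL_IDX_OFFSET;	(idx is always < 4)
 *
 * encode_tail()/decode_tail() used later in this file follow this scheme.
 */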

struct rqspinlock_timeout {
	u64 timeout_end;
	u64 duration;
	u64 cur;
	u16 spin;
};

#define RES_TIMEOUT_VAL	2

DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
EXPORT_SYMBOL_GPL(rqspinlock_held_locks);

static bool is_lock_released(rqspinlock_t *lock, u32 mask)
{
	if (!(atomic_read_acquire(&lock->val) & (mask)))
		return true;
	return false;
}

static noinline int check_deadlock_AA(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int cnt = min(RES_NR_HELD, rqh->cnt);

	/*
	 * Return an error if we hold the lock we are attempting to acquire.
	 * We'll iterate over max 32 locks; no need to do is_lock_released.
	 */
	for (int i = 0; i < cnt - 1; i++) {
		if (rqh->locks[i] == lock)
			return -EDEADLK;
	}
	return 0;
}

/*
 * This focuses on the most common case of ABBA deadlocks (or ABBA involving
 * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
 * timeouts as the final line of defense.
 */
static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
	void *remote_lock;
	int cpu;

	/*
	 * Find the CPU holding the lock that we want to acquire. If there is a
	 * deadlock scenario, we will read a stable set on the remote CPU and
	 * find the target. This would be a constant time operation instead of
	 * O(NR_CPUS) if we could determine the owning CPU from a lock value, but
	 * that requires increasing the size of the lock word.
	 */
	for_each_possible_cpu(cpu) {
		struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
		int real_cnt = READ_ONCE(rqh_cpu->cnt);
		int cnt = min(RES_NR_HELD, real_cnt);

		/*
		 * Let's ensure to break out of this loop if the lock is available for
		 * us to potentially acquire.
		 */
		if (is_lock_released(lock, mask))
			return 0;

		/*
		 * Skip ourselves, and CPUs whose count is less than 2, as they need at
		 * least one held lock and one acquisition attempt (reflected as top
		 * most entry) to participate in an ABBA deadlock.
		 *
		 * If cnt is more than RES_NR_HELD, it means the current lock being
		 * acquired won't appear in the table, and other locks in the table are
		 * already held, so we can't determine ABBA.
		 */
		if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
			continue;

		/*
		 * Obtain the entry at the top, this corresponds to the lock the
		 * remote CPU is attempting to acquire in a deadlock situation,
		 * and would be one of the locks we hold on the current CPU.
		 */
		remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
		/*
		 * If it is NULL, we've raced and cannot determine a deadlock
		 * conclusively, skip this CPU.
		 */
		if (!remote_lock)
			continue;
		/*
		 * Find if the lock we're attempting to acquire is held by this CPU.
		 * Don't consider the topmost entry, as that must be the latest lock
		 * being held or acquired. For a deadlock, the target CPU must also
		 * attempt to acquire a lock we hold, so for this search only 'cnt - 1'
		 * entries are important.
		 */
		for (int i = 0; i < cnt - 1; i++) {
			if (READ_ONCE(rqh_cpu->locks[i]) != lock)
				continue;
			/*
			 * We found our lock as held on the remote CPU. Is the
			 * acquisition attempt on the remote CPU for a lock held
			 * by us? If so, we have a deadlock situation, and need
			 * to recover.
			 */
			for (int i = 0; i < rqh_cnt - 1; i++) {
				if (rqh->locks[i] == remote_lock)
					return -EDEADLK;
			}
			/*
			 * Inconclusive; retry again later.
			 */
			return 0;
		}
	}
	return 0;
}
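
/*
 * Illustrative sketch of the case the ABBA check above is aimed at (CPU
 * numbers and lock names are made up for exposition):
 *
 *	CPU 0				CPU 1
 *	-----				-----
 *	res_spin_lock(A)		res_spin_lock(B)
 *	res_spin_lock(B)		res_spin_lock(A)
 *
 * With both acquisition attempts stuck, the held-locks tables look like
 * { A, B } on CPU 0 and { B, A } on CPU 1, where the topmost entry is the
 * in-flight attempt. CPU 0, spinning on B, scans CPU 1's table, finds B
 * among CPU 1's held entries, then finds CPU 1's topmost attempt (A) among
 * its own held locks, and therefore reports -EDEADLK and backs off.
 */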

static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
				  struct rqspinlock_timeout *ts)
{
	u64 prev = ts->cur;
	u64 time;

	if (!ts->timeout_end) {
		if (check_deadlock_AA(lock))
			return -EDEADLK;
		ts->cur = ktime_get_mono_fast_ns();
		ts->timeout_end = ts->cur + ts->duration;
		return 0;
	}

	time = ktime_get_mono_fast_ns();
	if (time > ts->timeout_end)
		return -ETIMEDOUT;

	/*
	 * A millisecond interval passed from last time? Trigger deadlock
	 * checks.
	 */
	if (prev + NSEC_PER_MSEC < time) {
		ts->cur = time;
		return check_deadlock_ABBA(lock, mask);
	}

	return 0;
}

/*
 * Do not amortize with spins when res_smp_cond_load_acquire is defined,
 * as the macro does internal amortization for us.
 */
#ifndef res_smp_cond_load_acquire
#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
	({							      \
		if (!(ts).spin++)				      \
			(ret) = check_timeout((lock), (mask), &(ts)); \
		(ret);						      \
	})
#else
#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
	({ (ret) = check_timeout((lock), (mask), &(ts)); })
#endif

/*
 * Initialize the 'spin' member.
 * Set spin member to 0 to trigger AA/ABBA checks immediately.
 */
#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })

/*
 * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary.
 * Duration is defined for each spin attempt, so set it here.
 */
#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })

/*
 * Provide a test-and-set fallback for cases when queued spin lock support is
 * absent from the architecture.
 */
int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
{
	struct rqspinlock_timeout ts;
	int val, ret = 0;

	RES_INIT_TIMEOUT(ts);
	/*
	 * We are either called directly from res_spin_lock after grabbing the
	 * deadlock detection entry when queued spinlocks are disabled, or from
	 * resilient_queued_spin_lock_slowpath after grabbing the deadlock
	 * detection entry. No need to obtain it here.
	 */

	/*
	 * Since the waiting loop's time is dependent on the amount of
	 * contention, a short timeout, unlike the rqspinlock waiting loops,
	 * isn't enough. Choose a second as the timeout value.
	 */
	RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
retry:
	val = atomic_read(&lock->val);

	if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
		if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
			goto out;
		cpu_relax();
		goto retry;
	}

	return 0;
out:
	release_held_lock_entry();
	return ret;
}
EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
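
/*
 * For context, a simplified sketch of how the lock API in the rqspinlock
 * header is expected to reach the slowpaths in this file, consistent with
 * the comments above about the deadlock detection entry being grabbed by
 * the caller. This is an illustration only, not the authoritative
 * definition (see the res_spin_lock() implementation in the rqspinlock
 * headers, which may differ in detail):
 *
 *	static __always_inline int res_spin_lock(rqspinlock_t *lock)
 *	{
 *		int val = 0;
 *
 *		grab_held_lock_entry(lock);
 *		if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, 1)))
 *			return 0;
 *	#ifdef CONFIG_QUEUED_SPINLOCKS
 *		return resilient_queued_spin_lock_slowpath(lock, val);
 *	#else
 *		return resilient_tas_spin_lock(lock);
 *	#endif
 *	}
 *
 * On failure, the slowpaths drop the entry via release_held_lock_entry()
 * before returning the error to the caller.
 */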

#ifdef CONFIG_QUEUED_SPINLOCKS

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);

#ifndef res_smp_cond_load_acquire
#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
#endif

#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
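
/*
 * The wait loops below embed RES_CHECK_TIMEOUT() in the condition expression
 * of res_smp_cond_load_acquire(). As a rough illustration (simplified from
 * the generic fallback; architectures may implement this differently, e.g.
 * with WFE on arm64), smp_cond_load_acquire() behaves approximately like:
 *
 *	#define smp_cond_load_acquire(ptr, cond_expr) ({	\
 *		typeof(*(ptr)) VAL;				\
 *		for (;;) {					\
 *			VAL = READ_ONCE(*(ptr));		\
 *			if (cond_expr)				\
 *				break;				\
 *			cpu_relax();				\
 *		}						\
 *		smp_acquire__after_ctrl_dep();			\
 *		VAL;						\
 *	})
 *
 * i.e. VAL names the most recently loaded value inside the condition, which
 * is why expressions like "!VAL || RES_CHECK_TIMEOUT(...)" below are
 * re-evaluated on every iteration of the wait loop.
 */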

/**
 * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -EDEADLK	- Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'            |  :
 *   queue               :       | ^--'                           |  :
 *                       :       v                                |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	struct rqspinlock_timeout ts;
	int idx, ret = 0;
	u32 old, tail;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (resilient_virt_spin_lock_enabled())
		return resilient_virt_spin_lock(lock);

	RES_INIT_TIMEOUT(ts);

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = queued_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/* Deadlock detection entry already held after failing fast path. */

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK) {
		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
	}

	if (ret) {
		/*
		 * We waited for the locked bit to go back to 0, as the pending
		 * waiter, but timed out. We need to clear the pending bit since
		 * we own it. Once a stuck owner has been recovered, the lock
		 * must be restored to a valid state, hence removing the pending
		 * bit is necessary.
		 *
		 * *,1,* -> *,0,*
		 */
		clear_pending(lock);
		lockevent_inc(rqspinlock_lock_timeout);
		goto err_release_entry;
	}

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	lockevent_inc(lock_pending);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	/*
	 * Do not queue if we're a waiter and someone is attempting this lock on
	 * the same CPU. In case of NMIs, this prevents long timeouts where we
	 * interrupt the pending waiter, and the owner, that will eventually
	 * signal the head of our queue, both of which are logically but not
	 * physically part of the queue, hence outside the scope of the idx > 0
	 * check below for the trylock fallback.
	 */
	if (check_deadlock_AA(lock)) {
		ret = -EDEADLK;
		goto err_release_entry;
	}

	lockevent_inc(lock_slowpath);
	/* Deadlock detection entry already held after failing fast path. */
	node = this_cpu_ptr(&rqnodes[0].mcs);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	trace_contention_begin(lock, LCB_F_SPIN);

	/*
	 * 4 nodes are allocated based on the assumption that there will
	 * not be nested NMIs taking spinlocks. That may not be true in
	 * some architectures even though the chance of needing more than
	 * 4 nodes will still be extremely unlikely. When that happens,
	 * we fall back to attempting a trylock operation without using
	 * any MCS node. Unlike qspinlock which cannot fail, we have the
	 * option of failing the slow path, and under contention, such a
	 * trylock spinning will likely be treated unfairly due to lack of
	 * queueing, hence do not spin.
	 */
	if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
		lockevent_inc(lock_no_node);
		if (!queued_spin_trylock(lock)) {
			ret = -EDEADLK;
			goto err_release_node;
		}
		goto release;
	}

	node = grab_mcs_node(node, idx);

	/*
	 * Keep counts of non-zero index values:
	 */
	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		int val;

		prev = decode_tail(old, rqnodes);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		val = arch_mcs_spin_lock_contended(&node->locked);
		if (val == RES_TIMEOUT_VAL) {
			ret = -ETIMEDOUT;
			goto waitq_timeout;
		}

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We optimistically load
		 * the next pointer & prefetch the cacheline for writing
		 * to reduce latency in the upcoming MCS unlock operation.
		 */
		next = READ_ONCE(node->next);
		if (next)
			prefetchw(next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
	 * meant to span maximum allowed time per critical section, and we may
	 * have both the owner of the lock and the pending bit waiter ahead of
	 * us.
	 */
	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));

	/* Disable queue destruction when we detect deadlocks. */
	if (ret == -EDEADLK) {
		if (!next)
			next = smp_cond_load_relaxed(&node->next, (VAL));
		arch_mcs_spin_unlock_contended(&next->locked);
		goto err_release_node;
	}

waitq_timeout:
	if (ret) {
		/*
		 * If the tail is still pointing to us, then we are the final waiter,
		 * and are responsible for resetting the tail back to 0. Otherwise, if
		 * the cmpxchg operation fails, we signal the next waiter to take exit
		 * and try the same. For a waiter with tail node 'n':
		 *
		 * n,*,* -> 0,*,*
		 *
		 * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
		 * possible locked/pending bits keep changing and we see failures even
		 * when we remain the head of wait queue. However, eventually,
		 * pending bit owner will unset the pending bit, and new waiters
		 * will queue behind us. This will leave the lock owner in
		 * charge, and it will eventually either set locked bit to 0, or
		 * leave it as 1, allowing us to make progress.
		 *
		 * We terminate the whole wait queue for two reasons. Firstly,
		 * we eschew per-waiter timeouts with one applied at the head of
		 * the wait queue. This allows everyone to break out faster
		 * once we've seen the owner / pending waiter not responding for
		 * the timeout duration from the head. Secondly, it avoids
		 * complicated synchronization, because when not leaving in FIFO
		 * order, prev's next pointer needs to be fixed up etc.
		 */
		if (!try_cmpxchg_tail(lock, tail, 0)) {
			next = smp_cond_load_relaxed(&node->next, VAL);
			WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
		}
		lockevent_inc(rqspinlock_lock_timeout);
		goto err_release_node;
	}

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 * above wait condition, therefore any concurrent setting of
	 * PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed(&node->next, (VAL));

	arch_mcs_spin_unlock_contended(&next->locked);

release:
	trace_contention_end(lock, 0);

	/*
	 * release the node
	 */
	__this_cpu_dec(rqnodes[0].mcs.count);
	return ret;
err_release_node:
	trace_contention_end(lock, ret);
	__this_cpu_dec(rqnodes[0].mcs.count);
err_release_entry:
	release_held_lock_entry();
	return ret;
}
EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);

#endif /* CONFIG_QUEUED_SPINLOCKS */

__bpf_kfunc_start_defs();

static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	struct bpf_stream_stage ss;
	struct bpf_prog *prog;

	prog = bpf_prog_find_from_stack();
	if (!prog)
		return;
	bpf_stream_stage(ss, prog, BPF_STDERR, ({
		bpf_stream_printk(ss, "ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : "");
		bpf_stream_printk(ss, "Attempted lock = 0x%px\n", lock);
		bpf_stream_printk(ss, "Total held locks = %d\n", rqh->cnt);
		for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++)
			bpf_stream_printk(ss, "Held lock[%2d] = 0x%px\n", i, rqh->locks[i]);
		bpf_stream_dump_stack(ss);
	}));
}

#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? "Timeout detected" : "AA or ABBA deadlock detected"; })

__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
{
	int ret;

	BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));

	preempt_disable();
	ret = res_spin_lock((rqspinlock_t *)lock);
	if (unlikely(ret)) {
		bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false);
		preempt_enable();
		return ret;
	}
	return 0;
}

__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock)
{
	res_spin_unlock((rqspinlock_t *)lock);
	preempt_enable();
}

__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
{
	u64 *ptr = (u64 *)flags__irq_flag;
	unsigned long flags;
	int ret;

	preempt_disable();
	local_irq_save(flags);
	ret = res_spin_lock((rqspinlock_t *)lock);
	if (unlikely(ret)) {
		bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true);
		local_irq_restore(flags);
		preempt_enable();
		return ret;
	}
	*ptr = flags;
	return 0;
}

__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
{
	u64 *ptr = (u64 *)flags__irq_flag;
	unsigned long flags = *ptr;

	res_spin_unlock((rqspinlock_t *)lock);
	local_irq_restore(flags);
	preempt_enable();
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(rqspinlock_kfunc_ids)
BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_res_spin_unlock)
BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore)
BTF_KFUNCS_END(rqspinlock_kfunc_ids)

static const struct btf_kfunc_id_set rqspinlock_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &rqspinlock_kfunc_ids,
};

static __init int rqspinlock_register_kfuncs(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set);
}
late_initcall(rqspinlock_register_kfuncs);