// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 *          Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif
#include <trace/events/lock.h>
#include <asm/rqspinlock.h>
#include <linux/timekeeping.h>

/*
 * Include queued spinlock definitions and statistics code
 */
#ifdef CONFIG_QUEUED_SPINLOCKS
#include "../locking/qspinlock.h"
#include "../locking/lock_events.h"
#include "rqspinlock.h"
#include "../locking/mcs_spinlock.h"
#endif

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
 * Scott) is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock; however, to
 * make it fit the 4 bytes that we assume spinlock_t to be, and to preserve
 * its existing API, we must modify it somehow.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending waiter (next->locked), we compress both of these:
 * {tail, next->locked} into a single u32 value.
 *
 * Since a spinlock disables recursion of its own context and there is a limit
 * to the contexts that can nest (task, softirq, hardirq, nmi), there are at
 * most 4 nesting levels, which can be encoded in a 2-bit number. We can then
 * encode the tail by combining the 2-bit nesting level with the cpu number.
 * With one byte for the lock value and 3 bytes for the tail, only a 32-bit
 * word is now needed. Even though we only need 1 bit for the lock, we extend
 * it to a full byte to achieve better performance for architectures that
 * support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node; thereby avoiding the need to carry a node from lock to unlock, and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 * atomic operations on smaller 8-bit and 16-bit data types.
 */
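
/*
 * For concreteness, this is a sketch of the 32-bit lock word layout used by
 * the stock qspinlock code that this file builds on (the authoritative
 * definitions live in include/asm-generic/qspinlock_types.h; the exact split
 * depends on NR_CPUS):
 *
 *  0- 7: locked byte
 *     8: pending bit
 *  9-15: unused (when NR_CPUS < 16K)
 * 16-17: tail index (the 2-bit nesting level)
 * 18-31: tail CPU (+1, so that an empty tail is encoded as 0)
 *
 * and, roughly, how a tail value is packed, mirroring the encode_tail()
 * helper used in the slowpath below:
 *
 *	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
 *	tail |= idx << _Q_TAIL_IDX_OFFSET;	(idx is the nesting level, < 4)
 */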

struct rqspinlock_timeout {
	u64 timeout_end;
	u64 duration;
	u64 cur;
	u16 spin;
};

#define RES_TIMEOUT_VAL	2

DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
EXPORT_SYMBOL_GPL(rqspinlock_held_locks);

static bool is_lock_released(rqspinlock_t *lock, u32 mask)
{
	if (!(atomic_read_acquire(&lock->val) & (mask)))
		return true;
	return false;
}

static noinline int check_deadlock_AA(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int cnt = min(RES_NR_HELD, rqh->cnt);

	/*
	 * Return an error if we hold the lock we are attempting to acquire.
	 * We'll iterate over max 32 locks; no need to do is_lock_released.
	 */
	for (int i = 0; i < cnt - 1; i++) {
		if (rqh->locks[i] == lock)
			return -EDEADLK;
	}
	return 0;
}

/*
 * This focuses on the most common case of ABBA deadlocks (or ABBA involving
 * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
 * timeouts as the final line of defense.
 */
static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
	void *remote_lock;
	int cpu;

	/*
	 * Find the CPU holding the lock that we want to acquire. If there is a
	 * deadlock scenario, we will read a stable set on the remote CPU and
	 * find the target. This would be a constant time operation instead of
	 * O(NR_CPUS) if we could determine the owning CPU from a lock value, but
	 * that requires increasing the size of the lock word.
	 */
	for_each_possible_cpu(cpu) {
		struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
		int real_cnt = READ_ONCE(rqh_cpu->cnt);
		int cnt = min(RES_NR_HELD, real_cnt);

		/*
		 * Break out of this loop if the lock has become available for
		 * us to potentially acquire.
		 */
		if (is_lock_released(lock, mask))
			return 0;

		/*
		 * Skip ourselves, and CPUs whose count is less than 2, as they need at
		 * least one held lock and one acquisition attempt (reflected as the
		 * topmost entry) to participate in an ABBA deadlock.
		 *
		 * If cnt is more than RES_NR_HELD, it means the current lock being
		 * acquired won't appear in the table, and other locks in the table are
		 * already held, so we can't determine ABBA.
		 */
		if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
			continue;

		/*
		 * Obtain the entry at the top; this corresponds to the lock the
		 * remote CPU is attempting to acquire in a deadlock situation,
		 * and would be one of the locks we hold on the current CPU.
		 */
		remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
		/*
		 * If it is NULL, we've raced and cannot determine a deadlock
		 * conclusively, skip this CPU.
		 */
		if (!remote_lock)
			continue;
		/*
		 * Find out if the lock we're attempting to acquire is held by this CPU.
		 * Don't consider the topmost entry, as that must be the latest lock
		 * being held or acquired. For a deadlock, the target CPU must also
		 * attempt to acquire a lock we hold, so for this search only 'cnt - 1'
		 * entries are important.
		 */
		for (int i = 0; i < cnt - 1; i++) {
			if (READ_ONCE(rqh_cpu->locks[i]) != lock)
				continue;
			/*
			 * We found our lock as held on the remote CPU. Is the
			 * acquisition attempt on the remote CPU for a lock held
			 * by us? If so, we have a deadlock situation, and need
			 * to recover.
			 */
			for (int i = 0; i < rqh_cnt - 1; i++) {
				if (rqh->locks[i] == remote_lock)
					return -EDEADLK;
			}
			/*
			 * Inconclusive; retry again later.
			 */
			return 0;
		}
	}
	return 0;
}
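
/*
 * To make the ABBA check above concrete, consider this (hypothetical)
 * scenario: CPU 0 holds lock A and is attempting lock B, while CPU 1 holds
 * lock B and is attempting lock A. The per-CPU held-locks tables then look
 * like this:
 *
 *	CPU 0: locks[0] = A (held), locks[1] = B (acquisition attempt, top)
 *	CPU 1: locks[0] = B (held), locks[1] = A (acquisition attempt, top)
 *
 * When CPU 0 runs check_deadlock_ABBA(B, ...), it scans CPU 1's table: the
 * topmost entry (remote_lock) is A, and B is found among CPU 1's held
 * entries. Since remote_lock A is also present in CPU 0's own held entries,
 * the ABBA cycle is confirmed and -EDEADLK is returned so that this
 * acquisition can back off and report the violation.
 */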

static noinline int check_deadlock(rqspinlock_t *lock, u32 mask)
{
	int ret;

	ret = check_deadlock_AA(lock);
	if (ret)
		return ret;
	ret = check_deadlock_ABBA(lock, mask);
	if (ret)
		return ret;

	return 0;
}

static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
				  struct rqspinlock_timeout *ts)
{
	u64 time = ktime_get_mono_fast_ns();
	u64 prev = ts->cur;

	if (!ts->timeout_end) {
		ts->cur = time;
		ts->timeout_end = time + ts->duration;
		return 0;
	}

	if (time > ts->timeout_end)
		return -ETIMEDOUT;

	/*
	 * Has a millisecond interval passed since the last check? If so,
	 * trigger the deadlock checks.
	 */
	if (prev + NSEC_PER_MSEC < time) {
		ts->cur = time;
		return check_deadlock(lock, mask);
	}

	return 0;
}

/*
 * Do not amortize with spins when res_smp_cond_load_acquire is defined,
 * as the macro does internal amortization for us.
 */
#ifndef res_smp_cond_load_acquire
#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
	({							      \
		if (!(ts).spin++)				      \
			(ret) = check_timeout((lock), (mask), &(ts)); \
		(ret);						      \
	})
#else
#define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
	({ (ret) = check_timeout((lock), (mask), &(ts)); })
#endif

/*
 * Initialize the 'spin' member.
 * Set spin member to 0 to trigger AA/ABBA checks immediately.
 */
#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })

/*
 * We only need to reset 'timeout_end'; 'spin' will just wrap around as necessary.
 * Duration is defined for each spin attempt, so set it here.
 */
#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })

/*
 * Provide a test-and-set fallback for cases when queued spin lock support is
 * absent from the architecture.
 */
int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
{
	struct rqspinlock_timeout ts;
	int val, ret = 0;

	RES_INIT_TIMEOUT(ts);
	grab_held_lock_entry(lock);

	/*
	 * Since this waiting loop's duration depends on the amount of
	 * contention, a short timeout (like the ones used by the queued
	 * slowpath waiting loops) isn't enough. Choose a second as the
	 * timeout value.
	 */
	RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
retry:
	val = atomic_read(&lock->val);

	if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
		if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
			goto out;
		cpu_relax();
		goto retry;
	}

	return 0;
out:
	release_held_lock_entry();
	return ret;
}
EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
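
/*
 * Unlike a plain spinlock, every rqspinlock acquisition can fail, so callers
 * must check the return value and skip the critical section on error. A
 * minimal usage sketch, mirroring the pattern of the BPF kfunc wrappers at
 * the bottom of this file (preemption handling and error propagation are the
 * caller's responsibility in this illustration):
 *
 *	int ret;
 *
 *	preempt_disable();
 *	ret = res_spin_lock(lock);
 *	if (ret) {
 *		preempt_enable();
 *		return ret;	(-EDEADLK or -ETIMEDOUT, lock is NOT held)
 *	}
 *	... critical section ...
 *	res_spin_unlock(lock);
 *	preempt_enable();
 */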

#ifdef CONFIG_QUEUED_SPINLOCKS

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
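
/*
 * As an illustrative scenario (not an additional constraint): a task spinning
 * in the queue path below uses rqnodes[0]; a hardirq that interrupts it on
 * the same CPU and contends on another rqspinlock uses rqnodes[1], and so on.
 * The index is simply the per-CPU nesting depth (node->count) at entry, not a
 * fixed per-context slot. Note that NMI context never takes a node here: the
 * in_nmi() check in the slowpath below diverts it to the trylock-only
 * fallback.
 */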

#ifndef res_smp_cond_load_acquire
#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
#endif

#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))

/**
 * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -EDEADLK	- Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
 *   queue               :       | ^--'                          |  :
 *                       :       v                               |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	struct rqspinlock_timeout ts;
	int idx, ret = 0;
	u32 old, tail;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (resilient_virt_spin_lock_enabled())
		return resilient_virt_spin_lock(lock);

	RES_INIT_TIMEOUT(ts);

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = queued_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * Grab an entry in the held locks array, to enable deadlock detection.
	 */
	grab_held_lock_entry(lock);

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK) {
		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
	}

	if (ret) {
		/*
		 * We waited for the locked bit to go back to 0, as the pending
		 * waiter, but timed out (or detected a deadlock). We need to
		 * clear the pending bit since we own it. Once a stuck owner
		 * has been recovered, the lock must be restored to a valid
		 * state, hence removing the pending bit is necessary.
		 *
		 * *,1,* -> *,0,*
		 */
		clear_pending(lock);
		lockevent_inc(rqspinlock_lock_timeout);
		goto err_release_entry;
	}

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	lockevent_inc(lock_pending);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	lockevent_inc(lock_slowpath);
	/*
	 * Grab deadlock detection entry for the queue path.
	 */
	grab_held_lock_entry(lock);

	node = this_cpu_ptr(&rqnodes[0].mcs);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	trace_contention_begin(lock, LCB_F_SPIN);

	/*
	 * 4 nodes are allocated based on the assumption that there will
	 * not be nested NMIs taking spinlocks. That may not be true in
	 * some architectures even though the chance of needing more than
	 * 4 nodes will still be extremely unlikely. When that happens,
	 * we fall back to spinning on the lock directly without using
	 * any MCS node. This is not the most elegant solution, but is
	 * simple enough.
	 */
	if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
		lockevent_inc(lock_no_node);
		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
		while (!queued_spin_trylock(lock)) {
			if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
				lockevent_inc(rqspinlock_lock_timeout);
				goto err_release_node;
			}
			cpu_relax();
		}
		goto release;
	}

	node = grab_mcs_node(node, idx);

	/*
	 * Keep counts of non-zero index values:
	 */
	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		int val;

		prev = decode_tail(old, rqnodes);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		val = arch_mcs_spin_lock_contended(&node->locked);
		if (val == RES_TIMEOUT_VAL) {
			ret = -ETIMEDOUT;
			goto waitq_timeout;
		}

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We optimistically load
		 * the next pointer & prefetch the cacheline for writing
		 * to reduce latency in the upcoming MCS unlock operation.
		 */
		next = READ_ONCE(node->next);
		if (next)
			prefetchw(next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
	 * meant to span the maximum allowed time per critical section, and we
	 * may have both the owner of the lock and the pending bit waiter
	 * ahead of us.
	 */
	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));

	/*
	 * Disable queue destruction when we detect deadlocks: hand the head
	 * position over to the next waiter and bail out ourselves, instead of
	 * tearing down the whole wait queue as the timeout path below does.
	 */
	if (ret == -EDEADLK) {
		if (!next)
			next = smp_cond_load_relaxed(&node->next, (VAL));
		arch_mcs_spin_unlock_contended(&next->locked);
		goto err_release_node;
	}

waitq_timeout:
	if (ret) {
		/*
		 * If the tail is still pointing to us, then we are the final waiter,
		 * and are responsible for resetting the tail back to 0. Otherwise, if
		 * the cmpxchg operation fails, we signal the next waiter to take the
		 * exit path and try the same. For a waiter with tail node 'n':
		 *
		 * n,*,* -> 0,*,*
		 *
		 * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
		 * possible locked/pending bits keep changing and we see failures even
		 * when we remain the head of the wait queue. However, eventually,
		 * the pending bit owner will unset the pending bit, and new waiters
		 * will queue behind us. This will leave the lock owner in
		 * charge, and it will eventually either set the locked bit to 0, or
		 * leave it as 1, allowing us to make progress.
		 *
		 * We terminate the whole wait queue for two reasons. Firstly,
		 * we eschew per-waiter timeouts in favor of one applied at the
		 * head of the wait queue. This allows everyone to break out
		 * faster once we've seen the owner / pending waiter not
		 * responding for the timeout duration from the head. Secondly,
		 * it avoids complicated synchronization, because when not
		 * leaving in FIFO order, prev's next pointer needs to be fixed
		 * up etc.
		 */
		if (!try_cmpxchg_tail(lock, tail, 0)) {
			next = smp_cond_load_relaxed(&node->next, VAL);
			WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
		}
		lockevent_inc(rqspinlock_lock_timeout);
		goto err_release_node;
	}

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 * above wait condition, therefore any concurrent setting of
	 * PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed(&node->next, (VAL));

	arch_mcs_spin_unlock_contended(&next->locked);

release:
	trace_contention_end(lock, 0);

	/*
	 * release the node
	 */
	__this_cpu_dec(rqnodes[0].mcs.count);
	return ret;
err_release_node:
	trace_contention_end(lock, ret);
	__this_cpu_dec(rqnodes[0].mcs.count);
err_release_entry:
	release_held_lock_entry();
	return ret;
}
EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);
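
/*
 * To illustrate the waitq_timeout handling above with a concrete
 * (hypothetical) scenario: suppose waiters W1 <- W2 <- W3 are queued behind a
 * stuck owner, with W1 at the head. W1's head wait times out first. Since the
 * tail still points to W3, W1's try_cmpxchg_tail() fails, so W1 stores
 * RES_TIMEOUT_VAL into W2->locked and leaves. W2's
 * arch_mcs_spin_lock_contended() then returns RES_TIMEOUT_VAL, so W2 jumps to
 * waitq_timeout with -ETIMEDOUT and performs the same hand-off to W3. W3,
 * being the final waiter, succeeds in resetting the tail to 0 and exits; the
 * remaining locked/pending bits are left to their owners, as described in the
 * comment above.
 */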

#endif /* CONFIG_QUEUED_SPINLOCKS */

__bpf_kfunc_start_defs();

static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	struct bpf_stream_stage ss;
	struct bpf_prog *prog;

	prog = bpf_prog_find_from_stack();
	if (!prog)
		return;
	bpf_stream_stage(ss, prog, BPF_STDERR, ({
		bpf_stream_printk(ss, "ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : "");
		bpf_stream_printk(ss, "Attempted lock = 0x%px\n", lock);
		bpf_stream_printk(ss, "Total held locks = %d\n", rqh->cnt);
		for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++)
			bpf_stream_printk(ss, "Held lock[%2d] = 0x%px\n", i, rqh->locks[i]);
		bpf_stream_dump_stack(ss);
	}));
}

#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? "Timeout detected" : "AA or ABBA deadlock detected"; })

__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
{
	int ret;

	BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));
	BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock));

	preempt_disable();
	ret = res_spin_lock((rqspinlock_t *)lock);
	if (unlikely(ret)) {
		bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false);
		preempt_enable();
		return ret;
	}
	return 0;
}

__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock)
{
	res_spin_unlock((rqspinlock_t *)lock);
	preempt_enable();
}

__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
{
	u64 *ptr = (u64 *)flags__irq_flag;
	unsigned long flags;
	int ret;

	preempt_disable();
	local_irq_save(flags);
	ret = res_spin_lock((rqspinlock_t *)lock);
	if (unlikely(ret)) {
		bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true);
		local_irq_restore(flags);
		preempt_enable();
		return ret;
	}
	*ptr = flags;
	return 0;
}

__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
{
	u64 *ptr = (u64 *)flags__irq_flag;
	unsigned long flags = *ptr;

	res_spin_unlock((rqspinlock_t *)lock);
	local_irq_restore(flags);
	preempt_enable();
}

__bpf_kfunc_end_defs();
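
/*
 * A rough sketch of how a BPF program is expected to use the kfuncs above
 * (the map and field names are made up for illustration; the verifier
 * additionally constrains where the lock may be embedded and requires that an
 * acquired lock is released on all program paths):
 *
 *	struct elem {
 *		struct bpf_res_spin_lock lock;
 *		u64 data;
 *	};
 *
 *	struct elem *e = bpf_map_lookup_elem(&my_map, &key);
 *
 *	if (!e)
 *		return 0;
 *	if (bpf_res_spin_lock(&e->lock))
 *		return 0;	(acquisition failed, lock is NOT held)
 *	e->data++;
 *	bpf_res_spin_unlock(&e->lock);
 */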

BTF_KFUNCS_START(rqspinlock_kfunc_ids)
BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_res_spin_unlock)
BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore)
BTF_KFUNCS_END(rqspinlock_kfunc_ids)

static const struct btf_kfunc_id_set rqspinlock_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &rqspinlock_kfunc_ids,
};

static __init int rqspinlock_register_kfuncs(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set);
}

late_initcall(rqspinlock_register_kfuncs);